2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwNone_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
95 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
97 __m256d dummy_mask,cutoff_mask;
98 __m128 tmpmask0,tmpmask1;
99 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
100 __m256d one = _mm256_set1_pd(1.0);
101 __m256d two = _mm256_set1_pd(2.0);
107 jindex = nlist->jindex;
109 shiftidx = nlist->shift;
111 shiftvec = fr->shift_vec[0];
112 fshift = fr->fshift[0];
113 facel = _mm256_set1_pd(fr->epsfac);
114 charge = mdatoms->chargeA;
116 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
117 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
118 beta2 = _mm256_mul_pd(beta,beta);
119 beta3 = _mm256_mul_pd(beta,beta2);
121 ewtab = fr->ic->tabq_coul_FDV0;
122 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
123 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
125 /* Setup water-specific parameters */
126 inr = nlist->iinr[0];
127 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
128 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
129 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
131 jq0 = _mm256_set1_pd(charge[inr+0]);
132 jq1 = _mm256_set1_pd(charge[inr+1]);
133 jq2 = _mm256_set1_pd(charge[inr+2]);
134 qq00 = _mm256_mul_pd(iq0,jq0);
135 qq01 = _mm256_mul_pd(iq0,jq1);
136 qq02 = _mm256_mul_pd(iq0,jq2);
137 qq10 = _mm256_mul_pd(iq1,jq0);
138 qq11 = _mm256_mul_pd(iq1,jq1);
139 qq12 = _mm256_mul_pd(iq1,jq2);
140 qq20 = _mm256_mul_pd(iq2,jq0);
141 qq21 = _mm256_mul_pd(iq2,jq1);
142 qq22 = _mm256_mul_pd(iq2,jq2);
144 /* Avoid stupid compiler warnings */
145 jnrA = jnrB = jnrC = jnrD = 0;
154 for(iidx=0;iidx<4*DIM;iidx++)
159 /* Start outer loop over neighborlists */
160 for(iidx=0; iidx<nri; iidx++)
162 /* Load shift vector for this list */
163 i_shift_offset = DIM*shiftidx[iidx];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
175 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
177 fix0 = _mm256_setzero_pd();
178 fiy0 = _mm256_setzero_pd();
179 fiz0 = _mm256_setzero_pd();
180 fix1 = _mm256_setzero_pd();
181 fiy1 = _mm256_setzero_pd();
182 fiz1 = _mm256_setzero_pd();
183 fix2 = _mm256_setzero_pd();
184 fiy2 = _mm256_setzero_pd();
185 fiz2 = _mm256_setzero_pd();
187 /* Reset potential sums */
188 velecsum = _mm256_setzero_pd();
190 /* Start inner kernel loop */
191 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
194 /* Get j neighbor index, and coordinate index */
199 j_coord_offsetA = DIM*jnrA;
200 j_coord_offsetB = DIM*jnrB;
201 j_coord_offsetC = DIM*jnrC;
202 j_coord_offsetD = DIM*jnrD;
204 /* load j atom coordinates */
205 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
206 x+j_coord_offsetC,x+j_coord_offsetD,
207 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
209 /* Calculate displacement vector */
210 dx00 = _mm256_sub_pd(ix0,jx0);
211 dy00 = _mm256_sub_pd(iy0,jy0);
212 dz00 = _mm256_sub_pd(iz0,jz0);
213 dx01 = _mm256_sub_pd(ix0,jx1);
214 dy01 = _mm256_sub_pd(iy0,jy1);
215 dz01 = _mm256_sub_pd(iz0,jz1);
216 dx02 = _mm256_sub_pd(ix0,jx2);
217 dy02 = _mm256_sub_pd(iy0,jy2);
218 dz02 = _mm256_sub_pd(iz0,jz2);
219 dx10 = _mm256_sub_pd(ix1,jx0);
220 dy10 = _mm256_sub_pd(iy1,jy0);
221 dz10 = _mm256_sub_pd(iz1,jz0);
222 dx11 = _mm256_sub_pd(ix1,jx1);
223 dy11 = _mm256_sub_pd(iy1,jy1);
224 dz11 = _mm256_sub_pd(iz1,jz1);
225 dx12 = _mm256_sub_pd(ix1,jx2);
226 dy12 = _mm256_sub_pd(iy1,jy2);
227 dz12 = _mm256_sub_pd(iz1,jz2);
228 dx20 = _mm256_sub_pd(ix2,jx0);
229 dy20 = _mm256_sub_pd(iy2,jy0);
230 dz20 = _mm256_sub_pd(iz2,jz0);
231 dx21 = _mm256_sub_pd(ix2,jx1);
232 dy21 = _mm256_sub_pd(iy2,jy1);
233 dz21 = _mm256_sub_pd(iz2,jz1);
234 dx22 = _mm256_sub_pd(ix2,jx2);
235 dy22 = _mm256_sub_pd(iy2,jy2);
236 dz22 = _mm256_sub_pd(iz2,jz2);
238 /* Calculate squared distance and things based on it */
239 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
240 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
241 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
242 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
243 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
244 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
245 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
246 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
247 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
249 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
250 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
251 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
252 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
253 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
254 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
255 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
256 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
257 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
259 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
260 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
261 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
262 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
263 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
264 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
265 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
266 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
267 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
269 fjx0 = _mm256_setzero_pd();
270 fjy0 = _mm256_setzero_pd();
271 fjz0 = _mm256_setzero_pd();
272 fjx1 = _mm256_setzero_pd();
273 fjy1 = _mm256_setzero_pd();
274 fjz1 = _mm256_setzero_pd();
275 fjx2 = _mm256_setzero_pd();
276 fjy2 = _mm256_setzero_pd();
277 fjz2 = _mm256_setzero_pd();
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 r00 = _mm256_mul_pd(rsq00,rinv00);
285 /* EWALD ELECTROSTATICS */
287 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
288 ewrt = _mm256_mul_pd(r00,ewtabscale);
289 ewitab = _mm256_cvttpd_epi32(ewrt);
290 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
291 ewitab = _mm_slli_epi32(ewitab,2);
292 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
293 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
294 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
295 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
296 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
297 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
298 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
299 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
300 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
302 /* Update potential sum for this i atom from the interaction with this j atom. */
303 velecsum = _mm256_add_pd(velecsum,velec);
307 /* Calculate temporary vectorial force */
308 tx = _mm256_mul_pd(fscal,dx00);
309 ty = _mm256_mul_pd(fscal,dy00);
310 tz = _mm256_mul_pd(fscal,dz00);
312 /* Update vectorial force */
313 fix0 = _mm256_add_pd(fix0,tx);
314 fiy0 = _mm256_add_pd(fiy0,ty);
315 fiz0 = _mm256_add_pd(fiz0,tz);
317 fjx0 = _mm256_add_pd(fjx0,tx);
318 fjy0 = _mm256_add_pd(fjy0,ty);
319 fjz0 = _mm256_add_pd(fjz0,tz);
321 /**************************
322 * CALCULATE INTERACTIONS *
323 **************************/
325 r01 = _mm256_mul_pd(rsq01,rinv01);
327 /* EWALD ELECTROSTATICS */
329 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
330 ewrt = _mm256_mul_pd(r01,ewtabscale);
331 ewitab = _mm256_cvttpd_epi32(ewrt);
332 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
333 ewitab = _mm_slli_epi32(ewitab,2);
334 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
335 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
336 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
337 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
338 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
339 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
340 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
341 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
342 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
344 /* Update potential sum for this i atom from the interaction with this j atom. */
345 velecsum = _mm256_add_pd(velecsum,velec);
349 /* Calculate temporary vectorial force */
350 tx = _mm256_mul_pd(fscal,dx01);
351 ty = _mm256_mul_pd(fscal,dy01);
352 tz = _mm256_mul_pd(fscal,dz01);
354 /* Update vectorial force */
355 fix0 = _mm256_add_pd(fix0,tx);
356 fiy0 = _mm256_add_pd(fiy0,ty);
357 fiz0 = _mm256_add_pd(fiz0,tz);
359 fjx1 = _mm256_add_pd(fjx1,tx);
360 fjy1 = _mm256_add_pd(fjy1,ty);
361 fjz1 = _mm256_add_pd(fjz1,tz);
363 /**************************
364 * CALCULATE INTERACTIONS *
365 **************************/
367 r02 = _mm256_mul_pd(rsq02,rinv02);
369 /* EWALD ELECTROSTATICS */
371 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
372 ewrt = _mm256_mul_pd(r02,ewtabscale);
373 ewitab = _mm256_cvttpd_epi32(ewrt);
374 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
375 ewitab = _mm_slli_epi32(ewitab,2);
376 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
377 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
378 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
379 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
380 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
381 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
382 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
383 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
384 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
386 /* Update potential sum for this i atom from the interaction with this j atom. */
387 velecsum = _mm256_add_pd(velecsum,velec);
391 /* Calculate temporary vectorial force */
392 tx = _mm256_mul_pd(fscal,dx02);
393 ty = _mm256_mul_pd(fscal,dy02);
394 tz = _mm256_mul_pd(fscal,dz02);
396 /* Update vectorial force */
397 fix0 = _mm256_add_pd(fix0,tx);
398 fiy0 = _mm256_add_pd(fiy0,ty);
399 fiz0 = _mm256_add_pd(fiz0,tz);
401 fjx2 = _mm256_add_pd(fjx2,tx);
402 fjy2 = _mm256_add_pd(fjy2,ty);
403 fjz2 = _mm256_add_pd(fjz2,tz);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 r10 = _mm256_mul_pd(rsq10,rinv10);
411 /* EWALD ELECTROSTATICS */
413 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
414 ewrt = _mm256_mul_pd(r10,ewtabscale);
415 ewitab = _mm256_cvttpd_epi32(ewrt);
416 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
417 ewitab = _mm_slli_epi32(ewitab,2);
418 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
419 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
420 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
421 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
422 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
423 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
424 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
425 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
426 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velecsum = _mm256_add_pd(velecsum,velec);
433 /* Calculate temporary vectorial force */
434 tx = _mm256_mul_pd(fscal,dx10);
435 ty = _mm256_mul_pd(fscal,dy10);
436 tz = _mm256_mul_pd(fscal,dz10);
438 /* Update vectorial force */
439 fix1 = _mm256_add_pd(fix1,tx);
440 fiy1 = _mm256_add_pd(fiy1,ty);
441 fiz1 = _mm256_add_pd(fiz1,tz);
443 fjx0 = _mm256_add_pd(fjx0,tx);
444 fjy0 = _mm256_add_pd(fjy0,ty);
445 fjz0 = _mm256_add_pd(fjz0,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 r11 = _mm256_mul_pd(rsq11,rinv11);
453 /* EWALD ELECTROSTATICS */
455 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
456 ewrt = _mm256_mul_pd(r11,ewtabscale);
457 ewitab = _mm256_cvttpd_epi32(ewrt);
458 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
459 ewitab = _mm_slli_epi32(ewitab,2);
460 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
461 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
462 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
463 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
464 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
465 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
466 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
467 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
468 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm256_add_pd(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm256_mul_pd(fscal,dx11);
477 ty = _mm256_mul_pd(fscal,dy11);
478 tz = _mm256_mul_pd(fscal,dz11);
480 /* Update vectorial force */
481 fix1 = _mm256_add_pd(fix1,tx);
482 fiy1 = _mm256_add_pd(fiy1,ty);
483 fiz1 = _mm256_add_pd(fiz1,tz);
485 fjx1 = _mm256_add_pd(fjx1,tx);
486 fjy1 = _mm256_add_pd(fjy1,ty);
487 fjz1 = _mm256_add_pd(fjz1,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 r12 = _mm256_mul_pd(rsq12,rinv12);
495 /* EWALD ELECTROSTATICS */
497 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
498 ewrt = _mm256_mul_pd(r12,ewtabscale);
499 ewitab = _mm256_cvttpd_epi32(ewrt);
500 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
501 ewitab = _mm_slli_epi32(ewitab,2);
502 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
503 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
504 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
505 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
506 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
507 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
508 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
509 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
510 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
512 /* Update potential sum for this i atom from the interaction with this j atom. */
513 velecsum = _mm256_add_pd(velecsum,velec);
517 /* Calculate temporary vectorial force */
518 tx = _mm256_mul_pd(fscal,dx12);
519 ty = _mm256_mul_pd(fscal,dy12);
520 tz = _mm256_mul_pd(fscal,dz12);
522 /* Update vectorial force */
523 fix1 = _mm256_add_pd(fix1,tx);
524 fiy1 = _mm256_add_pd(fiy1,ty);
525 fiz1 = _mm256_add_pd(fiz1,tz);
527 fjx2 = _mm256_add_pd(fjx2,tx);
528 fjy2 = _mm256_add_pd(fjy2,ty);
529 fjz2 = _mm256_add_pd(fjz2,tz);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 r20 = _mm256_mul_pd(rsq20,rinv20);
537 /* EWALD ELECTROSTATICS */
539 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
540 ewrt = _mm256_mul_pd(r20,ewtabscale);
541 ewitab = _mm256_cvttpd_epi32(ewrt);
542 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
543 ewitab = _mm_slli_epi32(ewitab,2);
544 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
545 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
546 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
547 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
548 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
549 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
550 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
551 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
552 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velecsum = _mm256_add_pd(velecsum,velec);
559 /* Calculate temporary vectorial force */
560 tx = _mm256_mul_pd(fscal,dx20);
561 ty = _mm256_mul_pd(fscal,dy20);
562 tz = _mm256_mul_pd(fscal,dz20);
564 /* Update vectorial force */
565 fix2 = _mm256_add_pd(fix2,tx);
566 fiy2 = _mm256_add_pd(fiy2,ty);
567 fiz2 = _mm256_add_pd(fiz2,tz);
569 fjx0 = _mm256_add_pd(fjx0,tx);
570 fjy0 = _mm256_add_pd(fjy0,ty);
571 fjz0 = _mm256_add_pd(fjz0,tz);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 r21 = _mm256_mul_pd(rsq21,rinv21);
579 /* EWALD ELECTROSTATICS */
581 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
582 ewrt = _mm256_mul_pd(r21,ewtabscale);
583 ewitab = _mm256_cvttpd_epi32(ewrt);
584 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
585 ewitab = _mm_slli_epi32(ewitab,2);
586 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
587 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
588 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
589 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
590 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
591 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
592 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
593 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
594 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
596 /* Update potential sum for this i atom from the interaction with this j atom. */
597 velecsum = _mm256_add_pd(velecsum,velec);
601 /* Calculate temporary vectorial force */
602 tx = _mm256_mul_pd(fscal,dx21);
603 ty = _mm256_mul_pd(fscal,dy21);
604 tz = _mm256_mul_pd(fscal,dz21);
606 /* Update vectorial force */
607 fix2 = _mm256_add_pd(fix2,tx);
608 fiy2 = _mm256_add_pd(fiy2,ty);
609 fiz2 = _mm256_add_pd(fiz2,tz);
611 fjx1 = _mm256_add_pd(fjx1,tx);
612 fjy1 = _mm256_add_pd(fjy1,ty);
613 fjz1 = _mm256_add_pd(fjz1,tz);
615 /**************************
616 * CALCULATE INTERACTIONS *
617 **************************/
619 r22 = _mm256_mul_pd(rsq22,rinv22);
621 /* EWALD ELECTROSTATICS */
623 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
624 ewrt = _mm256_mul_pd(r22,ewtabscale);
625 ewitab = _mm256_cvttpd_epi32(ewrt);
626 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
627 ewitab = _mm_slli_epi32(ewitab,2);
628 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
629 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
630 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
631 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
632 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
633 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
634 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
635 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
636 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velecsum = _mm256_add_pd(velecsum,velec);
643 /* Calculate temporary vectorial force */
644 tx = _mm256_mul_pd(fscal,dx22);
645 ty = _mm256_mul_pd(fscal,dy22);
646 tz = _mm256_mul_pd(fscal,dz22);
648 /* Update vectorial force */
649 fix2 = _mm256_add_pd(fix2,tx);
650 fiy2 = _mm256_add_pd(fiy2,ty);
651 fiz2 = _mm256_add_pd(fiz2,tz);
653 fjx2 = _mm256_add_pd(fjx2,tx);
654 fjy2 = _mm256_add_pd(fjy2,ty);
655 fjz2 = _mm256_add_pd(fjz2,tz);
657 fjptrA = f+j_coord_offsetA;
658 fjptrB = f+j_coord_offsetB;
659 fjptrC = f+j_coord_offsetC;
660 fjptrD = f+j_coord_offsetD;
662 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
663 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
665 /* Inner loop uses 369 flops */
671 /* Get j neighbor index, and coordinate index */
672 jnrlistA = jjnr[jidx];
673 jnrlistB = jjnr[jidx+1];
674 jnrlistC = jjnr[jidx+2];
675 jnrlistD = jjnr[jidx+3];
676 /* Sign of each element will be negative for non-real atoms.
677 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
678 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
680 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
682 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
683 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
684 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
686 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
687 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
688 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
689 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
690 j_coord_offsetA = DIM*jnrA;
691 j_coord_offsetB = DIM*jnrB;
692 j_coord_offsetC = DIM*jnrC;
693 j_coord_offsetD = DIM*jnrD;
695 /* load j atom coordinates */
696 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
697 x+j_coord_offsetC,x+j_coord_offsetD,
698 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
700 /* Calculate displacement vector */
701 dx00 = _mm256_sub_pd(ix0,jx0);
702 dy00 = _mm256_sub_pd(iy0,jy0);
703 dz00 = _mm256_sub_pd(iz0,jz0);
704 dx01 = _mm256_sub_pd(ix0,jx1);
705 dy01 = _mm256_sub_pd(iy0,jy1);
706 dz01 = _mm256_sub_pd(iz0,jz1);
707 dx02 = _mm256_sub_pd(ix0,jx2);
708 dy02 = _mm256_sub_pd(iy0,jy2);
709 dz02 = _mm256_sub_pd(iz0,jz2);
710 dx10 = _mm256_sub_pd(ix1,jx0);
711 dy10 = _mm256_sub_pd(iy1,jy0);
712 dz10 = _mm256_sub_pd(iz1,jz0);
713 dx11 = _mm256_sub_pd(ix1,jx1);
714 dy11 = _mm256_sub_pd(iy1,jy1);
715 dz11 = _mm256_sub_pd(iz1,jz1);
716 dx12 = _mm256_sub_pd(ix1,jx2);
717 dy12 = _mm256_sub_pd(iy1,jy2);
718 dz12 = _mm256_sub_pd(iz1,jz2);
719 dx20 = _mm256_sub_pd(ix2,jx0);
720 dy20 = _mm256_sub_pd(iy2,jy0);
721 dz20 = _mm256_sub_pd(iz2,jz0);
722 dx21 = _mm256_sub_pd(ix2,jx1);
723 dy21 = _mm256_sub_pd(iy2,jy1);
724 dz21 = _mm256_sub_pd(iz2,jz1);
725 dx22 = _mm256_sub_pd(ix2,jx2);
726 dy22 = _mm256_sub_pd(iy2,jy2);
727 dz22 = _mm256_sub_pd(iz2,jz2);
729 /* Calculate squared distance and things based on it */
730 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
731 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
732 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
733 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
734 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
735 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
736 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
737 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
738 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
740 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
741 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
742 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
743 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
744 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
745 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
746 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
747 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
748 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
750 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
751 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
752 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
753 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
754 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
755 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
756 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
757 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
758 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
760 fjx0 = _mm256_setzero_pd();
761 fjy0 = _mm256_setzero_pd();
762 fjz0 = _mm256_setzero_pd();
763 fjx1 = _mm256_setzero_pd();
764 fjy1 = _mm256_setzero_pd();
765 fjz1 = _mm256_setzero_pd();
766 fjx2 = _mm256_setzero_pd();
767 fjy2 = _mm256_setzero_pd();
768 fjz2 = _mm256_setzero_pd();
770 /**************************
771 * CALCULATE INTERACTIONS *
772 **************************/
774 r00 = _mm256_mul_pd(rsq00,rinv00);
775 r00 = _mm256_andnot_pd(dummy_mask,r00);
777 /* EWALD ELECTROSTATICS */
779 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
780 ewrt = _mm256_mul_pd(r00,ewtabscale);
781 ewitab = _mm256_cvttpd_epi32(ewrt);
782 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
783 ewitab = _mm_slli_epi32(ewitab,2);
784 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
785 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
786 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
787 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
788 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
789 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
790 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
791 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
792 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
794 /* Update potential sum for this i atom from the interaction with this j atom. */
795 velec = _mm256_andnot_pd(dummy_mask,velec);
796 velecsum = _mm256_add_pd(velecsum,velec);
800 fscal = _mm256_andnot_pd(dummy_mask,fscal);
802 /* Calculate temporary vectorial force */
803 tx = _mm256_mul_pd(fscal,dx00);
804 ty = _mm256_mul_pd(fscal,dy00);
805 tz = _mm256_mul_pd(fscal,dz00);
807 /* Update vectorial force */
808 fix0 = _mm256_add_pd(fix0,tx);
809 fiy0 = _mm256_add_pd(fiy0,ty);
810 fiz0 = _mm256_add_pd(fiz0,tz);
812 fjx0 = _mm256_add_pd(fjx0,tx);
813 fjy0 = _mm256_add_pd(fjy0,ty);
814 fjz0 = _mm256_add_pd(fjz0,tz);
816 /**************************
817 * CALCULATE INTERACTIONS *
818 **************************/
820 r01 = _mm256_mul_pd(rsq01,rinv01);
821 r01 = _mm256_andnot_pd(dummy_mask,r01);
823 /* EWALD ELECTROSTATICS */
825 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
826 ewrt = _mm256_mul_pd(r01,ewtabscale);
827 ewitab = _mm256_cvttpd_epi32(ewrt);
828 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
829 ewitab = _mm_slli_epi32(ewitab,2);
830 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
831 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
832 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
833 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
834 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
835 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
836 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
837 velec = _mm256_mul_pd(qq01,_mm256_sub_pd(rinv01,velec));
838 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
840 /* Update potential sum for this i atom from the interaction with this j atom. */
841 velec = _mm256_andnot_pd(dummy_mask,velec);
842 velecsum = _mm256_add_pd(velecsum,velec);
846 fscal = _mm256_andnot_pd(dummy_mask,fscal);
848 /* Calculate temporary vectorial force */
849 tx = _mm256_mul_pd(fscal,dx01);
850 ty = _mm256_mul_pd(fscal,dy01);
851 tz = _mm256_mul_pd(fscal,dz01);
853 /* Update vectorial force */
854 fix0 = _mm256_add_pd(fix0,tx);
855 fiy0 = _mm256_add_pd(fiy0,ty);
856 fiz0 = _mm256_add_pd(fiz0,tz);
858 fjx1 = _mm256_add_pd(fjx1,tx);
859 fjy1 = _mm256_add_pd(fjy1,ty);
860 fjz1 = _mm256_add_pd(fjz1,tz);
862 /**************************
863 * CALCULATE INTERACTIONS *
864 **************************/
866 r02 = _mm256_mul_pd(rsq02,rinv02);
867 r02 = _mm256_andnot_pd(dummy_mask,r02);
869 /* EWALD ELECTROSTATICS */
871 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
872 ewrt = _mm256_mul_pd(r02,ewtabscale);
873 ewitab = _mm256_cvttpd_epi32(ewrt);
874 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
875 ewitab = _mm_slli_epi32(ewitab,2);
876 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
877 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
878 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
879 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
880 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
881 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
882 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
883 velec = _mm256_mul_pd(qq02,_mm256_sub_pd(rinv02,velec));
884 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 velec = _mm256_andnot_pd(dummy_mask,velec);
888 velecsum = _mm256_add_pd(velecsum,velec);
892 fscal = _mm256_andnot_pd(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm256_mul_pd(fscal,dx02);
896 ty = _mm256_mul_pd(fscal,dy02);
897 tz = _mm256_mul_pd(fscal,dz02);
899 /* Update vectorial force */
900 fix0 = _mm256_add_pd(fix0,tx);
901 fiy0 = _mm256_add_pd(fiy0,ty);
902 fiz0 = _mm256_add_pd(fiz0,tz);
904 fjx2 = _mm256_add_pd(fjx2,tx);
905 fjy2 = _mm256_add_pd(fjy2,ty);
906 fjz2 = _mm256_add_pd(fjz2,tz);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 r10 = _mm256_mul_pd(rsq10,rinv10);
913 r10 = _mm256_andnot_pd(dummy_mask,r10);
915 /* EWALD ELECTROSTATICS */
917 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
918 ewrt = _mm256_mul_pd(r10,ewtabscale);
919 ewitab = _mm256_cvttpd_epi32(ewrt);
920 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
921 ewitab = _mm_slli_epi32(ewitab,2);
922 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
923 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
924 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
925 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
926 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
927 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
928 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
929 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
930 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
932 /* Update potential sum for this i atom from the interaction with this j atom. */
933 velec = _mm256_andnot_pd(dummy_mask,velec);
934 velecsum = _mm256_add_pd(velecsum,velec);
938 fscal = _mm256_andnot_pd(dummy_mask,fscal);
940 /* Calculate temporary vectorial force */
941 tx = _mm256_mul_pd(fscal,dx10);
942 ty = _mm256_mul_pd(fscal,dy10);
943 tz = _mm256_mul_pd(fscal,dz10);
945 /* Update vectorial force */
946 fix1 = _mm256_add_pd(fix1,tx);
947 fiy1 = _mm256_add_pd(fiy1,ty);
948 fiz1 = _mm256_add_pd(fiz1,tz);
950 fjx0 = _mm256_add_pd(fjx0,tx);
951 fjy0 = _mm256_add_pd(fjy0,ty);
952 fjz0 = _mm256_add_pd(fjz0,tz);
954 /**************************
955 * CALCULATE INTERACTIONS *
956 **************************/
958 r11 = _mm256_mul_pd(rsq11,rinv11);
959 r11 = _mm256_andnot_pd(dummy_mask,r11);
961 /* EWALD ELECTROSTATICS */
963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
964 ewrt = _mm256_mul_pd(r11,ewtabscale);
965 ewitab = _mm256_cvttpd_epi32(ewrt);
966 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
967 ewitab = _mm_slli_epi32(ewitab,2);
968 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
969 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
970 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
971 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
972 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
973 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
974 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
975 velec = _mm256_mul_pd(qq11,_mm256_sub_pd(rinv11,velec));
976 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm256_andnot_pd(dummy_mask,velec);
980 velecsum = _mm256_add_pd(velecsum,velec);
984 fscal = _mm256_andnot_pd(dummy_mask,fscal);
986 /* Calculate temporary vectorial force */
987 tx = _mm256_mul_pd(fscal,dx11);
988 ty = _mm256_mul_pd(fscal,dy11);
989 tz = _mm256_mul_pd(fscal,dz11);
991 /* Update vectorial force */
992 fix1 = _mm256_add_pd(fix1,tx);
993 fiy1 = _mm256_add_pd(fiy1,ty);
994 fiz1 = _mm256_add_pd(fiz1,tz);
996 fjx1 = _mm256_add_pd(fjx1,tx);
997 fjy1 = _mm256_add_pd(fjy1,ty);
998 fjz1 = _mm256_add_pd(fjz1,tz);
1000 /**************************
1001 * CALCULATE INTERACTIONS *
1002 **************************/
1004 r12 = _mm256_mul_pd(rsq12,rinv12);
1005 r12 = _mm256_andnot_pd(dummy_mask,r12);
1007 /* EWALD ELECTROSTATICS */
1009 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1010 ewrt = _mm256_mul_pd(r12,ewtabscale);
1011 ewitab = _mm256_cvttpd_epi32(ewrt);
1012 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1013 ewitab = _mm_slli_epi32(ewitab,2);
1014 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1015 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1016 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1017 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1018 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1019 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1020 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1021 velec = _mm256_mul_pd(qq12,_mm256_sub_pd(rinv12,velec));
1022 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1024 /* Update potential sum for this i atom from the interaction with this j atom. */
1025 velec = _mm256_andnot_pd(dummy_mask,velec);
1026 velecsum = _mm256_add_pd(velecsum,velec);
1030 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1032 /* Calculate temporary vectorial force */
1033 tx = _mm256_mul_pd(fscal,dx12);
1034 ty = _mm256_mul_pd(fscal,dy12);
1035 tz = _mm256_mul_pd(fscal,dz12);
1037 /* Update vectorial force */
1038 fix1 = _mm256_add_pd(fix1,tx);
1039 fiy1 = _mm256_add_pd(fiy1,ty);
1040 fiz1 = _mm256_add_pd(fiz1,tz);
1042 fjx2 = _mm256_add_pd(fjx2,tx);
1043 fjy2 = _mm256_add_pd(fjy2,ty);
1044 fjz2 = _mm256_add_pd(fjz2,tz);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 r20 = _mm256_mul_pd(rsq20,rinv20);
1051 r20 = _mm256_andnot_pd(dummy_mask,r20);
1053 /* EWALD ELECTROSTATICS */
1055 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1056 ewrt = _mm256_mul_pd(r20,ewtabscale);
1057 ewitab = _mm256_cvttpd_epi32(ewrt);
1058 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1059 ewitab = _mm_slli_epi32(ewitab,2);
1060 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1061 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1062 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1063 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1064 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1065 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1066 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1067 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1068 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1070 /* Update potential sum for this i atom from the interaction with this j atom. */
1071 velec = _mm256_andnot_pd(dummy_mask,velec);
1072 velecsum = _mm256_add_pd(velecsum,velec);
1076 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1078 /* Calculate temporary vectorial force */
1079 tx = _mm256_mul_pd(fscal,dx20);
1080 ty = _mm256_mul_pd(fscal,dy20);
1081 tz = _mm256_mul_pd(fscal,dz20);
1083 /* Update vectorial force */
1084 fix2 = _mm256_add_pd(fix2,tx);
1085 fiy2 = _mm256_add_pd(fiy2,ty);
1086 fiz2 = _mm256_add_pd(fiz2,tz);
1088 fjx0 = _mm256_add_pd(fjx0,tx);
1089 fjy0 = _mm256_add_pd(fjy0,ty);
1090 fjz0 = _mm256_add_pd(fjz0,tz);
1092 /**************************
1093 * CALCULATE INTERACTIONS *
1094 **************************/
1096 r21 = _mm256_mul_pd(rsq21,rinv21);
1097 r21 = _mm256_andnot_pd(dummy_mask,r21);
1099 /* EWALD ELECTROSTATICS */
1101 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1102 ewrt = _mm256_mul_pd(r21,ewtabscale);
1103 ewitab = _mm256_cvttpd_epi32(ewrt);
1104 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1105 ewitab = _mm_slli_epi32(ewitab,2);
1106 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1107 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1108 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1109 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1110 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1111 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1112 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1113 velec = _mm256_mul_pd(qq21,_mm256_sub_pd(rinv21,velec));
1114 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1116 /* Update potential sum for this i atom from the interaction with this j atom. */
1117 velec = _mm256_andnot_pd(dummy_mask,velec);
1118 velecsum = _mm256_add_pd(velecsum,velec);
1122 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1124 /* Calculate temporary vectorial force */
1125 tx = _mm256_mul_pd(fscal,dx21);
1126 ty = _mm256_mul_pd(fscal,dy21);
1127 tz = _mm256_mul_pd(fscal,dz21);
1129 /* Update vectorial force */
1130 fix2 = _mm256_add_pd(fix2,tx);
1131 fiy2 = _mm256_add_pd(fiy2,ty);
1132 fiz2 = _mm256_add_pd(fiz2,tz);
1134 fjx1 = _mm256_add_pd(fjx1,tx);
1135 fjy1 = _mm256_add_pd(fjy1,ty);
1136 fjz1 = _mm256_add_pd(fjz1,tz);
1138 /**************************
1139 * CALCULATE INTERACTIONS *
1140 **************************/
1142 r22 = _mm256_mul_pd(rsq22,rinv22);
1143 r22 = _mm256_andnot_pd(dummy_mask,r22);
1145 /* EWALD ELECTROSTATICS */
1147 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1148 ewrt = _mm256_mul_pd(r22,ewtabscale);
1149 ewitab = _mm256_cvttpd_epi32(ewrt);
1150 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1151 ewitab = _mm_slli_epi32(ewitab,2);
1152 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1153 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1154 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1155 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1156 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1157 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1158 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1159 velec = _mm256_mul_pd(qq22,_mm256_sub_pd(rinv22,velec));
1160 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1162 /* Update potential sum for this i atom from the interaction with this j atom. */
1163 velec = _mm256_andnot_pd(dummy_mask,velec);
1164 velecsum = _mm256_add_pd(velecsum,velec);
1168 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1170 /* Calculate temporary vectorial force */
1171 tx = _mm256_mul_pd(fscal,dx22);
1172 ty = _mm256_mul_pd(fscal,dy22);
1173 tz = _mm256_mul_pd(fscal,dz22);
1175 /* Update vectorial force */
1176 fix2 = _mm256_add_pd(fix2,tx);
1177 fiy2 = _mm256_add_pd(fiy2,ty);
1178 fiz2 = _mm256_add_pd(fiz2,tz);
1180 fjx2 = _mm256_add_pd(fjx2,tx);
1181 fjy2 = _mm256_add_pd(fjy2,ty);
1182 fjz2 = _mm256_add_pd(fjz2,tz);
1184 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1185 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1186 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1187 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1189 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1190 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1192 /* Inner loop uses 378 flops */
1195 /* End of innermost loop */
1197 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1198 f+i_coord_offset,fshift+i_shift_offset);
1201 /* Update potential energies */
1202 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1204 /* Increment number of inner iterations */
1205 inneriter += j_index_end - j_index_start;
1207 /* Outer loop uses 19 flops */
1210 /* Increment number of outer iterations */
1213 /* Update outer/inner flops */
1215 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*378);
1218 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
1219 * Electrostatics interaction: Ewald
1220 * VdW interaction: None
1221 * Geometry: Water3-Water3
1222 * Calculate force/pot: Force
1225 nb_kernel_ElecEw_VdwNone_GeomW3W3_F_avx_256_double
1226 (t_nblist * gmx_restrict nlist,
1227 rvec * gmx_restrict xx,
1228 rvec * gmx_restrict ff,
1229 t_forcerec * gmx_restrict fr,
1230 t_mdatoms * gmx_restrict mdatoms,
1231 nb_kernel_data_t * gmx_restrict kernel_data,
1232 t_nrnb * gmx_restrict nrnb)
1234 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1235 * just 0 for non-waters.
1236 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1237 * jnr indices corresponding to data put in the four positions in the SIMD register.
1239 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1240 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1241 int jnrA,jnrB,jnrC,jnrD;
1242 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1243 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1244 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1245 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1246 real rcutoff_scalar;
1247 real *shiftvec,*fshift,*x,*f;
1248 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1249 real scratch[4*DIM];
1250 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1251 real * vdwioffsetptr0;
1252 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1253 real * vdwioffsetptr1;
1254 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1255 real * vdwioffsetptr2;
1256 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1257 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1258 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1259 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1260 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1261 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1262 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1263 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1264 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1265 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1266 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1267 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1268 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1269 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1270 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1271 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1272 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1275 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1276 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1278 __m256d dummy_mask,cutoff_mask;
1279 __m128 tmpmask0,tmpmask1;
1280 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1281 __m256d one = _mm256_set1_pd(1.0);
1282 __m256d two = _mm256_set1_pd(2.0);
1288 jindex = nlist->jindex;
1290 shiftidx = nlist->shift;
1292 shiftvec = fr->shift_vec[0];
1293 fshift = fr->fshift[0];
1294 facel = _mm256_set1_pd(fr->epsfac);
1295 charge = mdatoms->chargeA;
1297 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
1298 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
1299 beta2 = _mm256_mul_pd(beta,beta);
1300 beta3 = _mm256_mul_pd(beta,beta2);
1302 ewtab = fr->ic->tabq_coul_F;
1303 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
1304 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
1306 /* Setup water-specific parameters */
1307 inr = nlist->iinr[0];
1308 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1309 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1310 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1312 jq0 = _mm256_set1_pd(charge[inr+0]);
1313 jq1 = _mm256_set1_pd(charge[inr+1]);
1314 jq2 = _mm256_set1_pd(charge[inr+2]);
1315 qq00 = _mm256_mul_pd(iq0,jq0);
1316 qq01 = _mm256_mul_pd(iq0,jq1);
1317 qq02 = _mm256_mul_pd(iq0,jq2);
1318 qq10 = _mm256_mul_pd(iq1,jq0);
1319 qq11 = _mm256_mul_pd(iq1,jq1);
1320 qq12 = _mm256_mul_pd(iq1,jq2);
1321 qq20 = _mm256_mul_pd(iq2,jq0);
1322 qq21 = _mm256_mul_pd(iq2,jq1);
1323 qq22 = _mm256_mul_pd(iq2,jq2);
1325 /* Avoid stupid compiler warnings */
1326 jnrA = jnrB = jnrC = jnrD = 0;
1327 j_coord_offsetA = 0;
1328 j_coord_offsetB = 0;
1329 j_coord_offsetC = 0;
1330 j_coord_offsetD = 0;
1335 for(iidx=0;iidx<4*DIM;iidx++)
1337 scratch[iidx] = 0.0;
1340 /* Start outer loop over neighborlists */
1341 for(iidx=0; iidx<nri; iidx++)
1343 /* Load shift vector for this list */
1344 i_shift_offset = DIM*shiftidx[iidx];
1346 /* Load limits for loop over neighbors */
1347 j_index_start = jindex[iidx];
1348 j_index_end = jindex[iidx+1];
1350 /* Get outer coordinate index */
1352 i_coord_offset = DIM*inr;
1354 /* Load i particle coords and add shift vector */
1355 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1356 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1358 fix0 = _mm256_setzero_pd();
1359 fiy0 = _mm256_setzero_pd();
1360 fiz0 = _mm256_setzero_pd();
1361 fix1 = _mm256_setzero_pd();
1362 fiy1 = _mm256_setzero_pd();
1363 fiz1 = _mm256_setzero_pd();
1364 fix2 = _mm256_setzero_pd();
1365 fiy2 = _mm256_setzero_pd();
1366 fiz2 = _mm256_setzero_pd();
1368 /* Start inner kernel loop */
1369 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1372 /* Get j neighbor index, and coordinate index */
1374 jnrB = jjnr[jidx+1];
1375 jnrC = jjnr[jidx+2];
1376 jnrD = jjnr[jidx+3];
1377 j_coord_offsetA = DIM*jnrA;
1378 j_coord_offsetB = DIM*jnrB;
1379 j_coord_offsetC = DIM*jnrC;
1380 j_coord_offsetD = DIM*jnrD;
1382 /* load j atom coordinates */
1383 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1384 x+j_coord_offsetC,x+j_coord_offsetD,
1385 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1387 /* Calculate displacement vector */
1388 dx00 = _mm256_sub_pd(ix0,jx0);
1389 dy00 = _mm256_sub_pd(iy0,jy0);
1390 dz00 = _mm256_sub_pd(iz0,jz0);
1391 dx01 = _mm256_sub_pd(ix0,jx1);
1392 dy01 = _mm256_sub_pd(iy0,jy1);
1393 dz01 = _mm256_sub_pd(iz0,jz1);
1394 dx02 = _mm256_sub_pd(ix0,jx2);
1395 dy02 = _mm256_sub_pd(iy0,jy2);
1396 dz02 = _mm256_sub_pd(iz0,jz2);
1397 dx10 = _mm256_sub_pd(ix1,jx0);
1398 dy10 = _mm256_sub_pd(iy1,jy0);
1399 dz10 = _mm256_sub_pd(iz1,jz0);
1400 dx11 = _mm256_sub_pd(ix1,jx1);
1401 dy11 = _mm256_sub_pd(iy1,jy1);
1402 dz11 = _mm256_sub_pd(iz1,jz1);
1403 dx12 = _mm256_sub_pd(ix1,jx2);
1404 dy12 = _mm256_sub_pd(iy1,jy2);
1405 dz12 = _mm256_sub_pd(iz1,jz2);
1406 dx20 = _mm256_sub_pd(ix2,jx0);
1407 dy20 = _mm256_sub_pd(iy2,jy0);
1408 dz20 = _mm256_sub_pd(iz2,jz0);
1409 dx21 = _mm256_sub_pd(ix2,jx1);
1410 dy21 = _mm256_sub_pd(iy2,jy1);
1411 dz21 = _mm256_sub_pd(iz2,jz1);
1412 dx22 = _mm256_sub_pd(ix2,jx2);
1413 dy22 = _mm256_sub_pd(iy2,jy2);
1414 dz22 = _mm256_sub_pd(iz2,jz2);
1416 /* Calculate squared distance and things based on it */
1417 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1418 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1419 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1420 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1421 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1422 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1423 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1424 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1425 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1427 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1428 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1429 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1430 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1431 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1432 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1433 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1434 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1435 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1437 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1438 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1439 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1440 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1441 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1442 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1443 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1444 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1445 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1447 fjx0 = _mm256_setzero_pd();
1448 fjy0 = _mm256_setzero_pd();
1449 fjz0 = _mm256_setzero_pd();
1450 fjx1 = _mm256_setzero_pd();
1451 fjy1 = _mm256_setzero_pd();
1452 fjz1 = _mm256_setzero_pd();
1453 fjx2 = _mm256_setzero_pd();
1454 fjy2 = _mm256_setzero_pd();
1455 fjz2 = _mm256_setzero_pd();
1457 /**************************
1458 * CALCULATE INTERACTIONS *
1459 **************************/
1461 r00 = _mm256_mul_pd(rsq00,rinv00);
1463 /* EWALD ELECTROSTATICS */
1465 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1466 ewrt = _mm256_mul_pd(r00,ewtabscale);
1467 ewitab = _mm256_cvttpd_epi32(ewrt);
1468 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1469 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1470 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1472 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1473 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1477 /* Calculate temporary vectorial force */
1478 tx = _mm256_mul_pd(fscal,dx00);
1479 ty = _mm256_mul_pd(fscal,dy00);
1480 tz = _mm256_mul_pd(fscal,dz00);
1482 /* Update vectorial force */
1483 fix0 = _mm256_add_pd(fix0,tx);
1484 fiy0 = _mm256_add_pd(fiy0,ty);
1485 fiz0 = _mm256_add_pd(fiz0,tz);
1487 fjx0 = _mm256_add_pd(fjx0,tx);
1488 fjy0 = _mm256_add_pd(fjy0,ty);
1489 fjz0 = _mm256_add_pd(fjz0,tz);
1491 /**************************
1492 * CALCULATE INTERACTIONS *
1493 **************************/
1495 r01 = _mm256_mul_pd(rsq01,rinv01);
1497 /* EWALD ELECTROSTATICS */
1499 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1500 ewrt = _mm256_mul_pd(r01,ewtabscale);
1501 ewitab = _mm256_cvttpd_epi32(ewrt);
1502 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1503 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1504 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1506 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1507 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1511 /* Calculate temporary vectorial force */
1512 tx = _mm256_mul_pd(fscal,dx01);
1513 ty = _mm256_mul_pd(fscal,dy01);
1514 tz = _mm256_mul_pd(fscal,dz01);
1516 /* Update vectorial force */
1517 fix0 = _mm256_add_pd(fix0,tx);
1518 fiy0 = _mm256_add_pd(fiy0,ty);
1519 fiz0 = _mm256_add_pd(fiz0,tz);
1521 fjx1 = _mm256_add_pd(fjx1,tx);
1522 fjy1 = _mm256_add_pd(fjy1,ty);
1523 fjz1 = _mm256_add_pd(fjz1,tz);
1525 /**************************
1526 * CALCULATE INTERACTIONS *
1527 **************************/
1529 r02 = _mm256_mul_pd(rsq02,rinv02);
1531 /* EWALD ELECTROSTATICS */
1533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1534 ewrt = _mm256_mul_pd(r02,ewtabscale);
1535 ewitab = _mm256_cvttpd_epi32(ewrt);
1536 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1537 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1538 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1540 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1541 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1545 /* Calculate temporary vectorial force */
1546 tx = _mm256_mul_pd(fscal,dx02);
1547 ty = _mm256_mul_pd(fscal,dy02);
1548 tz = _mm256_mul_pd(fscal,dz02);
1550 /* Update vectorial force */
1551 fix0 = _mm256_add_pd(fix0,tx);
1552 fiy0 = _mm256_add_pd(fiy0,ty);
1553 fiz0 = _mm256_add_pd(fiz0,tz);
1555 fjx2 = _mm256_add_pd(fjx2,tx);
1556 fjy2 = _mm256_add_pd(fjy2,ty);
1557 fjz2 = _mm256_add_pd(fjz2,tz);
1559 /**************************
1560 * CALCULATE INTERACTIONS *
1561 **************************/
1563 r10 = _mm256_mul_pd(rsq10,rinv10);
1565 /* EWALD ELECTROSTATICS */
1567 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1568 ewrt = _mm256_mul_pd(r10,ewtabscale);
1569 ewitab = _mm256_cvttpd_epi32(ewrt);
1570 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1571 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1572 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1574 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1575 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1579 /* Calculate temporary vectorial force */
1580 tx = _mm256_mul_pd(fscal,dx10);
1581 ty = _mm256_mul_pd(fscal,dy10);
1582 tz = _mm256_mul_pd(fscal,dz10);
1584 /* Update vectorial force */
1585 fix1 = _mm256_add_pd(fix1,tx);
1586 fiy1 = _mm256_add_pd(fiy1,ty);
1587 fiz1 = _mm256_add_pd(fiz1,tz);
1589 fjx0 = _mm256_add_pd(fjx0,tx);
1590 fjy0 = _mm256_add_pd(fjy0,ty);
1591 fjz0 = _mm256_add_pd(fjz0,tz);
1593 /**************************
1594 * CALCULATE INTERACTIONS *
1595 **************************/
1597 r11 = _mm256_mul_pd(rsq11,rinv11);
1599 /* EWALD ELECTROSTATICS */
1601 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1602 ewrt = _mm256_mul_pd(r11,ewtabscale);
1603 ewitab = _mm256_cvttpd_epi32(ewrt);
1604 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1605 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1606 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1608 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1609 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
1613 /* Calculate temporary vectorial force */
1614 tx = _mm256_mul_pd(fscal,dx11);
1615 ty = _mm256_mul_pd(fscal,dy11);
1616 tz = _mm256_mul_pd(fscal,dz11);
1618 /* Update vectorial force */
1619 fix1 = _mm256_add_pd(fix1,tx);
1620 fiy1 = _mm256_add_pd(fiy1,ty);
1621 fiz1 = _mm256_add_pd(fiz1,tz);
1623 fjx1 = _mm256_add_pd(fjx1,tx);
1624 fjy1 = _mm256_add_pd(fjy1,ty);
1625 fjz1 = _mm256_add_pd(fjz1,tz);
1627 /**************************
1628 * CALCULATE INTERACTIONS *
1629 **************************/
1631 r12 = _mm256_mul_pd(rsq12,rinv12);
1633 /* EWALD ELECTROSTATICS */
1635 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1636 ewrt = _mm256_mul_pd(r12,ewtabscale);
1637 ewitab = _mm256_cvttpd_epi32(ewrt);
1638 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1639 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1640 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1642 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1643 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
1647 /* Calculate temporary vectorial force */
1648 tx = _mm256_mul_pd(fscal,dx12);
1649 ty = _mm256_mul_pd(fscal,dy12);
1650 tz = _mm256_mul_pd(fscal,dz12);
1652 /* Update vectorial force */
1653 fix1 = _mm256_add_pd(fix1,tx);
1654 fiy1 = _mm256_add_pd(fiy1,ty);
1655 fiz1 = _mm256_add_pd(fiz1,tz);
1657 fjx2 = _mm256_add_pd(fjx2,tx);
1658 fjy2 = _mm256_add_pd(fjy2,ty);
1659 fjz2 = _mm256_add_pd(fjz2,tz);
1661 /**************************
1662 * CALCULATE INTERACTIONS *
1663 **************************/
1665 r20 = _mm256_mul_pd(rsq20,rinv20);
1667 /* EWALD ELECTROSTATICS */
1669 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1670 ewrt = _mm256_mul_pd(r20,ewtabscale);
1671 ewitab = _mm256_cvttpd_epi32(ewrt);
1672 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1673 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1674 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1676 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1677 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1681 /* Calculate temporary vectorial force */
1682 tx = _mm256_mul_pd(fscal,dx20);
1683 ty = _mm256_mul_pd(fscal,dy20);
1684 tz = _mm256_mul_pd(fscal,dz20);
1686 /* Update vectorial force */
1687 fix2 = _mm256_add_pd(fix2,tx);
1688 fiy2 = _mm256_add_pd(fiy2,ty);
1689 fiz2 = _mm256_add_pd(fiz2,tz);
1691 fjx0 = _mm256_add_pd(fjx0,tx);
1692 fjy0 = _mm256_add_pd(fjy0,ty);
1693 fjz0 = _mm256_add_pd(fjz0,tz);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 r21 = _mm256_mul_pd(rsq21,rinv21);
1701 /* EWALD ELECTROSTATICS */
1703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1704 ewrt = _mm256_mul_pd(r21,ewtabscale);
1705 ewitab = _mm256_cvttpd_epi32(ewrt);
1706 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1707 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1708 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1710 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1711 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
1715 /* Calculate temporary vectorial force */
1716 tx = _mm256_mul_pd(fscal,dx21);
1717 ty = _mm256_mul_pd(fscal,dy21);
1718 tz = _mm256_mul_pd(fscal,dz21);
1720 /* Update vectorial force */
1721 fix2 = _mm256_add_pd(fix2,tx);
1722 fiy2 = _mm256_add_pd(fiy2,ty);
1723 fiz2 = _mm256_add_pd(fiz2,tz);
1725 fjx1 = _mm256_add_pd(fjx1,tx);
1726 fjy1 = _mm256_add_pd(fjy1,ty);
1727 fjz1 = _mm256_add_pd(fjz1,tz);
1729 /**************************
1730 * CALCULATE INTERACTIONS *
1731 **************************/
1733 r22 = _mm256_mul_pd(rsq22,rinv22);
1735 /* EWALD ELECTROSTATICS */
1737 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1738 ewrt = _mm256_mul_pd(r22,ewtabscale);
1739 ewitab = _mm256_cvttpd_epi32(ewrt);
1740 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1741 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1742 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1744 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1745 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
1749 /* Calculate temporary vectorial force */
1750 tx = _mm256_mul_pd(fscal,dx22);
1751 ty = _mm256_mul_pd(fscal,dy22);
1752 tz = _mm256_mul_pd(fscal,dz22);
1754 /* Update vectorial force */
1755 fix2 = _mm256_add_pd(fix2,tx);
1756 fiy2 = _mm256_add_pd(fiy2,ty);
1757 fiz2 = _mm256_add_pd(fiz2,tz);
1759 fjx2 = _mm256_add_pd(fjx2,tx);
1760 fjy2 = _mm256_add_pd(fjy2,ty);
1761 fjz2 = _mm256_add_pd(fjz2,tz);
1763 fjptrA = f+j_coord_offsetA;
1764 fjptrB = f+j_coord_offsetB;
1765 fjptrC = f+j_coord_offsetC;
1766 fjptrD = f+j_coord_offsetD;
1768 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1769 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1771 /* Inner loop uses 324 flops */
1774 if(jidx<j_index_end)
1777 /* Get j neighbor index, and coordinate index */
1778 jnrlistA = jjnr[jidx];
1779 jnrlistB = jjnr[jidx+1];
1780 jnrlistC = jjnr[jidx+2];
1781 jnrlistD = jjnr[jidx+3];
1782 /* Sign of each element will be negative for non-real atoms.
1783 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1784 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1786 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1788 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1789 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1790 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1792 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1793 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1794 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1795 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1796 j_coord_offsetA = DIM*jnrA;
1797 j_coord_offsetB = DIM*jnrB;
1798 j_coord_offsetC = DIM*jnrC;
1799 j_coord_offsetD = DIM*jnrD;
1801 /* load j atom coordinates */
1802 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1803 x+j_coord_offsetC,x+j_coord_offsetD,
1804 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1806 /* Calculate displacement vector */
1807 dx00 = _mm256_sub_pd(ix0,jx0);
1808 dy00 = _mm256_sub_pd(iy0,jy0);
1809 dz00 = _mm256_sub_pd(iz0,jz0);
1810 dx01 = _mm256_sub_pd(ix0,jx1);
1811 dy01 = _mm256_sub_pd(iy0,jy1);
1812 dz01 = _mm256_sub_pd(iz0,jz1);
1813 dx02 = _mm256_sub_pd(ix0,jx2);
1814 dy02 = _mm256_sub_pd(iy0,jy2);
1815 dz02 = _mm256_sub_pd(iz0,jz2);
1816 dx10 = _mm256_sub_pd(ix1,jx0);
1817 dy10 = _mm256_sub_pd(iy1,jy0);
1818 dz10 = _mm256_sub_pd(iz1,jz0);
1819 dx11 = _mm256_sub_pd(ix1,jx1);
1820 dy11 = _mm256_sub_pd(iy1,jy1);
1821 dz11 = _mm256_sub_pd(iz1,jz1);
1822 dx12 = _mm256_sub_pd(ix1,jx2);
1823 dy12 = _mm256_sub_pd(iy1,jy2);
1824 dz12 = _mm256_sub_pd(iz1,jz2);
1825 dx20 = _mm256_sub_pd(ix2,jx0);
1826 dy20 = _mm256_sub_pd(iy2,jy0);
1827 dz20 = _mm256_sub_pd(iz2,jz0);
1828 dx21 = _mm256_sub_pd(ix2,jx1);
1829 dy21 = _mm256_sub_pd(iy2,jy1);
1830 dz21 = _mm256_sub_pd(iz2,jz1);
1831 dx22 = _mm256_sub_pd(ix2,jx2);
1832 dy22 = _mm256_sub_pd(iy2,jy2);
1833 dz22 = _mm256_sub_pd(iz2,jz2);
1835 /* Calculate squared distance and things based on it */
1836 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1837 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1838 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1839 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1840 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1841 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1842 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1843 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1844 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1846 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1847 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1848 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1849 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1850 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1851 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1852 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1853 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1854 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1856 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1857 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1858 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1859 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1860 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1861 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1862 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1863 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1864 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1866 fjx0 = _mm256_setzero_pd();
1867 fjy0 = _mm256_setzero_pd();
1868 fjz0 = _mm256_setzero_pd();
1869 fjx1 = _mm256_setzero_pd();
1870 fjy1 = _mm256_setzero_pd();
1871 fjz1 = _mm256_setzero_pd();
1872 fjx2 = _mm256_setzero_pd();
1873 fjy2 = _mm256_setzero_pd();
1874 fjz2 = _mm256_setzero_pd();
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 r00 = _mm256_mul_pd(rsq00,rinv00);
1881 r00 = _mm256_andnot_pd(dummy_mask,r00);
1883 /* EWALD ELECTROSTATICS */
1885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1886 ewrt = _mm256_mul_pd(r00,ewtabscale);
1887 ewitab = _mm256_cvttpd_epi32(ewrt);
1888 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1889 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1890 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1892 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1893 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1897 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1899 /* Calculate temporary vectorial force */
1900 tx = _mm256_mul_pd(fscal,dx00);
1901 ty = _mm256_mul_pd(fscal,dy00);
1902 tz = _mm256_mul_pd(fscal,dz00);
1904 /* Update vectorial force */
1905 fix0 = _mm256_add_pd(fix0,tx);
1906 fiy0 = _mm256_add_pd(fiy0,ty);
1907 fiz0 = _mm256_add_pd(fiz0,tz);
1909 fjx0 = _mm256_add_pd(fjx0,tx);
1910 fjy0 = _mm256_add_pd(fjy0,ty);
1911 fjz0 = _mm256_add_pd(fjz0,tz);
1913 /**************************
1914 * CALCULATE INTERACTIONS *
1915 **************************/
1917 r01 = _mm256_mul_pd(rsq01,rinv01);
1918 r01 = _mm256_andnot_pd(dummy_mask,r01);
1920 /* EWALD ELECTROSTATICS */
1922 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1923 ewrt = _mm256_mul_pd(r01,ewtabscale);
1924 ewitab = _mm256_cvttpd_epi32(ewrt);
1925 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1926 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1927 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1929 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1930 felec = _mm256_mul_pd(_mm256_mul_pd(qq01,rinv01),_mm256_sub_pd(rinvsq01,felec));
1934 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1936 /* Calculate temporary vectorial force */
1937 tx = _mm256_mul_pd(fscal,dx01);
1938 ty = _mm256_mul_pd(fscal,dy01);
1939 tz = _mm256_mul_pd(fscal,dz01);
1941 /* Update vectorial force */
1942 fix0 = _mm256_add_pd(fix0,tx);
1943 fiy0 = _mm256_add_pd(fiy0,ty);
1944 fiz0 = _mm256_add_pd(fiz0,tz);
1946 fjx1 = _mm256_add_pd(fjx1,tx);
1947 fjy1 = _mm256_add_pd(fjy1,ty);
1948 fjz1 = _mm256_add_pd(fjz1,tz);
1950 /**************************
1951 * CALCULATE INTERACTIONS *
1952 **************************/
1954 r02 = _mm256_mul_pd(rsq02,rinv02);
1955 r02 = _mm256_andnot_pd(dummy_mask,r02);
1957 /* EWALD ELECTROSTATICS */
1959 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1960 ewrt = _mm256_mul_pd(r02,ewtabscale);
1961 ewitab = _mm256_cvttpd_epi32(ewrt);
1962 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1963 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1964 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1966 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1967 felec = _mm256_mul_pd(_mm256_mul_pd(qq02,rinv02),_mm256_sub_pd(rinvsq02,felec));
1971 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1973 /* Calculate temporary vectorial force */
1974 tx = _mm256_mul_pd(fscal,dx02);
1975 ty = _mm256_mul_pd(fscal,dy02);
1976 tz = _mm256_mul_pd(fscal,dz02);
1978 /* Update vectorial force */
1979 fix0 = _mm256_add_pd(fix0,tx);
1980 fiy0 = _mm256_add_pd(fiy0,ty);
1981 fiz0 = _mm256_add_pd(fiz0,tz);
1983 fjx2 = _mm256_add_pd(fjx2,tx);
1984 fjy2 = _mm256_add_pd(fjy2,ty);
1985 fjz2 = _mm256_add_pd(fjz2,tz);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 r10 = _mm256_mul_pd(rsq10,rinv10);
1992 r10 = _mm256_andnot_pd(dummy_mask,r10);
1994 /* EWALD ELECTROSTATICS */
1996 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1997 ewrt = _mm256_mul_pd(r10,ewtabscale);
1998 ewitab = _mm256_cvttpd_epi32(ewrt);
1999 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2000 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2001 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2003 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2004 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
2008 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2010 /* Calculate temporary vectorial force */
2011 tx = _mm256_mul_pd(fscal,dx10);
2012 ty = _mm256_mul_pd(fscal,dy10);
2013 tz = _mm256_mul_pd(fscal,dz10);
2015 /* Update vectorial force */
2016 fix1 = _mm256_add_pd(fix1,tx);
2017 fiy1 = _mm256_add_pd(fiy1,ty);
2018 fiz1 = _mm256_add_pd(fiz1,tz);
2020 fjx0 = _mm256_add_pd(fjx0,tx);
2021 fjy0 = _mm256_add_pd(fjy0,ty);
2022 fjz0 = _mm256_add_pd(fjz0,tz);
2024 /**************************
2025 * CALCULATE INTERACTIONS *
2026 **************************/
2028 r11 = _mm256_mul_pd(rsq11,rinv11);
2029 r11 = _mm256_andnot_pd(dummy_mask,r11);
2031 /* EWALD ELECTROSTATICS */
2033 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2034 ewrt = _mm256_mul_pd(r11,ewtabscale);
2035 ewitab = _mm256_cvttpd_epi32(ewrt);
2036 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2037 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2038 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2040 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2041 felec = _mm256_mul_pd(_mm256_mul_pd(qq11,rinv11),_mm256_sub_pd(rinvsq11,felec));
2045 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2047 /* Calculate temporary vectorial force */
2048 tx = _mm256_mul_pd(fscal,dx11);
2049 ty = _mm256_mul_pd(fscal,dy11);
2050 tz = _mm256_mul_pd(fscal,dz11);
2052 /* Update vectorial force */
2053 fix1 = _mm256_add_pd(fix1,tx);
2054 fiy1 = _mm256_add_pd(fiy1,ty);
2055 fiz1 = _mm256_add_pd(fiz1,tz);
2057 fjx1 = _mm256_add_pd(fjx1,tx);
2058 fjy1 = _mm256_add_pd(fjy1,ty);
2059 fjz1 = _mm256_add_pd(fjz1,tz);
2061 /**************************
2062 * CALCULATE INTERACTIONS *
2063 **************************/
2065 r12 = _mm256_mul_pd(rsq12,rinv12);
2066 r12 = _mm256_andnot_pd(dummy_mask,r12);
2068 /* EWALD ELECTROSTATICS */
2070 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2071 ewrt = _mm256_mul_pd(r12,ewtabscale);
2072 ewitab = _mm256_cvttpd_epi32(ewrt);
2073 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2074 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2075 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2077 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2078 felec = _mm256_mul_pd(_mm256_mul_pd(qq12,rinv12),_mm256_sub_pd(rinvsq12,felec));
2082 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2084 /* Calculate temporary vectorial force */
2085 tx = _mm256_mul_pd(fscal,dx12);
2086 ty = _mm256_mul_pd(fscal,dy12);
2087 tz = _mm256_mul_pd(fscal,dz12);
2089 /* Update vectorial force */
2090 fix1 = _mm256_add_pd(fix1,tx);
2091 fiy1 = _mm256_add_pd(fiy1,ty);
2092 fiz1 = _mm256_add_pd(fiz1,tz);
2094 fjx2 = _mm256_add_pd(fjx2,tx);
2095 fjy2 = _mm256_add_pd(fjy2,ty);
2096 fjz2 = _mm256_add_pd(fjz2,tz);
2098 /**************************
2099 * CALCULATE INTERACTIONS *
2100 **************************/
2102 r20 = _mm256_mul_pd(rsq20,rinv20);
2103 r20 = _mm256_andnot_pd(dummy_mask,r20);
2105 /* EWALD ELECTROSTATICS */
2107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2108 ewrt = _mm256_mul_pd(r20,ewtabscale);
2109 ewitab = _mm256_cvttpd_epi32(ewrt);
2110 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2111 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2112 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2114 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2115 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
2119 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2121 /* Calculate temporary vectorial force */
2122 tx = _mm256_mul_pd(fscal,dx20);
2123 ty = _mm256_mul_pd(fscal,dy20);
2124 tz = _mm256_mul_pd(fscal,dz20);
2126 /* Update vectorial force */
2127 fix2 = _mm256_add_pd(fix2,tx);
2128 fiy2 = _mm256_add_pd(fiy2,ty);
2129 fiz2 = _mm256_add_pd(fiz2,tz);
2131 fjx0 = _mm256_add_pd(fjx0,tx);
2132 fjy0 = _mm256_add_pd(fjy0,ty);
2133 fjz0 = _mm256_add_pd(fjz0,tz);
2135 /**************************
2136 * CALCULATE INTERACTIONS *
2137 **************************/
2139 r21 = _mm256_mul_pd(rsq21,rinv21);
2140 r21 = _mm256_andnot_pd(dummy_mask,r21);
2142 /* EWALD ELECTROSTATICS */
2144 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2145 ewrt = _mm256_mul_pd(r21,ewtabscale);
2146 ewitab = _mm256_cvttpd_epi32(ewrt);
2147 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2148 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2149 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2151 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2152 felec = _mm256_mul_pd(_mm256_mul_pd(qq21,rinv21),_mm256_sub_pd(rinvsq21,felec));
2156 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2158 /* Calculate temporary vectorial force */
2159 tx = _mm256_mul_pd(fscal,dx21);
2160 ty = _mm256_mul_pd(fscal,dy21);
2161 tz = _mm256_mul_pd(fscal,dz21);
2163 /* Update vectorial force */
2164 fix2 = _mm256_add_pd(fix2,tx);
2165 fiy2 = _mm256_add_pd(fiy2,ty);
2166 fiz2 = _mm256_add_pd(fiz2,tz);
2168 fjx1 = _mm256_add_pd(fjx1,tx);
2169 fjy1 = _mm256_add_pd(fjy1,ty);
2170 fjz1 = _mm256_add_pd(fjz1,tz);
2172 /**************************
2173 * CALCULATE INTERACTIONS *
2174 **************************/
2176 r22 = _mm256_mul_pd(rsq22,rinv22);
2177 r22 = _mm256_andnot_pd(dummy_mask,r22);
2179 /* EWALD ELECTROSTATICS */
2181 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2182 ewrt = _mm256_mul_pd(r22,ewtabscale);
2183 ewitab = _mm256_cvttpd_epi32(ewrt);
2184 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
2185 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
2186 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
2188 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
2189 felec = _mm256_mul_pd(_mm256_mul_pd(qq22,rinv22),_mm256_sub_pd(rinvsq22,felec));
2193 fscal = _mm256_andnot_pd(dummy_mask,fscal);
2195 /* Calculate temporary vectorial force */
2196 tx = _mm256_mul_pd(fscal,dx22);
2197 ty = _mm256_mul_pd(fscal,dy22);
2198 tz = _mm256_mul_pd(fscal,dz22);
2200 /* Update vectorial force */
2201 fix2 = _mm256_add_pd(fix2,tx);
2202 fiy2 = _mm256_add_pd(fiy2,ty);
2203 fiz2 = _mm256_add_pd(fiz2,tz);
2205 fjx2 = _mm256_add_pd(fjx2,tx);
2206 fjy2 = _mm256_add_pd(fjy2,ty);
2207 fjz2 = _mm256_add_pd(fjz2,tz);
2209 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2210 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2211 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2212 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2214 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
2215 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2217 /* Inner loop uses 333 flops */
2220 /* End of innermost loop */
2222 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2223 f+i_coord_offset,fshift+i_shift_offset);
2225 /* Increment number of inner iterations */
2226 inneriter += j_index_end - j_index_start;
2228 /* Outer loop uses 18 flops */
2231 /* Increment number of outer iterations */
2234 /* Update outer/inner flops */
2236 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*333);