2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: None
40 * Geometry: Water3-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
80 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
81 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
84 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
85 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
87 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
88 real rswitch_scalar,d_scalar;
89 __m256d dummy_mask,cutoff_mask;
90 __m128 tmpmask0,tmpmask1;
91 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
92 __m256d one = _mm256_set1_pd(1.0);
93 __m256d two = _mm256_set1_pd(2.0);
99 jindex = nlist->jindex;
101 shiftidx = nlist->shift;
103 shiftvec = fr->shift_vec[0];
104 fshift = fr->fshift[0];
105 facel = _mm256_set1_pd(fr->epsfac);
106 charge = mdatoms->chargeA;
108 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
109 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
110 beta2 = _mm256_mul_pd(beta,beta);
111 beta3 = _mm256_mul_pd(beta,beta2);
113 ewtab = fr->ic->tabq_coul_FDV0;
114 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
115 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
120 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
121 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
123 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
124 rcutoff_scalar = fr->rcoulomb;
125 rcutoff = _mm256_set1_pd(rcutoff_scalar);
126 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
128 rswitch_scalar = fr->rcoulomb_switch;
129 rswitch = _mm256_set1_pd(rswitch_scalar);
130 /* Setup switch parameters */
131 d_scalar = rcutoff_scalar-rswitch_scalar;
132 d = _mm256_set1_pd(d_scalar);
133 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
134 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
135 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
136 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
137 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
138 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
140 /* Avoid stupid compiler warnings */
141 jnrA = jnrB = jnrC = jnrD = 0;
150 for(iidx=0;iidx<4*DIM;iidx++)
155 /* Start outer loop over neighborlists */
156 for(iidx=0; iidx<nri; iidx++)
158 /* Load shift vector for this list */
159 i_shift_offset = DIM*shiftidx[iidx];
161 /* Load limits for loop over neighbors */
162 j_index_start = jindex[iidx];
163 j_index_end = jindex[iidx+1];
165 /* Get outer coordinate index */
167 i_coord_offset = DIM*inr;
169 /* Load i particle coords and add shift vector */
170 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
171 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
173 fix0 = _mm256_setzero_pd();
174 fiy0 = _mm256_setzero_pd();
175 fiz0 = _mm256_setzero_pd();
176 fix1 = _mm256_setzero_pd();
177 fiy1 = _mm256_setzero_pd();
178 fiz1 = _mm256_setzero_pd();
179 fix2 = _mm256_setzero_pd();
180 fiy2 = _mm256_setzero_pd();
181 fiz2 = _mm256_setzero_pd();
183 /* Reset potential sums */
184 velecsum = _mm256_setzero_pd();
186 /* Start inner kernel loop */
187 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
190 /* Get j neighbor index, and coordinate index */
195 j_coord_offsetA = DIM*jnrA;
196 j_coord_offsetB = DIM*jnrB;
197 j_coord_offsetC = DIM*jnrC;
198 j_coord_offsetD = DIM*jnrD;
200 /* load j atom coordinates */
201 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
202 x+j_coord_offsetC,x+j_coord_offsetD,
205 /* Calculate displacement vector */
206 dx00 = _mm256_sub_pd(ix0,jx0);
207 dy00 = _mm256_sub_pd(iy0,jy0);
208 dz00 = _mm256_sub_pd(iz0,jz0);
209 dx10 = _mm256_sub_pd(ix1,jx0);
210 dy10 = _mm256_sub_pd(iy1,jy0);
211 dz10 = _mm256_sub_pd(iz1,jz0);
212 dx20 = _mm256_sub_pd(ix2,jx0);
213 dy20 = _mm256_sub_pd(iy2,jy0);
214 dz20 = _mm256_sub_pd(iz2,jz0);
216 /* Calculate squared distance and things based on it */
217 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
218 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
219 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
221 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
222 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
223 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
225 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
226 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
227 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
229 /* Load parameters for j particles */
230 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
231 charge+jnrC+0,charge+jnrD+0);
233 fjx0 = _mm256_setzero_pd();
234 fjy0 = _mm256_setzero_pd();
235 fjz0 = _mm256_setzero_pd();
237 /**************************
238 * CALCULATE INTERACTIONS *
239 **************************/
241 if (gmx_mm256_any_lt(rsq00,rcutoff2))
244 r00 = _mm256_mul_pd(rsq00,rinv00);
246 /* Compute parameters for interactions between i and j atoms */
247 qq00 = _mm256_mul_pd(iq0,jq0);
249 /* EWALD ELECTROSTATICS */
251 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
252 ewrt = _mm256_mul_pd(r00,ewtabscale);
253 ewitab = _mm256_cvttpd_epi32(ewrt);
254 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
255 ewitab = _mm_slli_epi32(ewitab,2);
256 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
257 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
258 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
259 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
260 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
261 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
262 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
263 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
264 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
266 d = _mm256_sub_pd(r00,rswitch);
267 d = _mm256_max_pd(d,_mm256_setzero_pd());
268 d2 = _mm256_mul_pd(d,d);
269 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
271 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
273 /* Evaluate switch function */
274 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
275 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
276 velec = _mm256_mul_pd(velec,sw);
277 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
279 /* Update potential sum for this i atom from the interaction with this j atom. */
280 velec = _mm256_and_pd(velec,cutoff_mask);
281 velecsum = _mm256_add_pd(velecsum,velec);
285 fscal = _mm256_and_pd(fscal,cutoff_mask);
287 /* Calculate temporary vectorial force */
288 tx = _mm256_mul_pd(fscal,dx00);
289 ty = _mm256_mul_pd(fscal,dy00);
290 tz = _mm256_mul_pd(fscal,dz00);
292 /* Update vectorial force */
293 fix0 = _mm256_add_pd(fix0,tx);
294 fiy0 = _mm256_add_pd(fiy0,ty);
295 fiz0 = _mm256_add_pd(fiz0,tz);
297 fjx0 = _mm256_add_pd(fjx0,tx);
298 fjy0 = _mm256_add_pd(fjy0,ty);
299 fjz0 = _mm256_add_pd(fjz0,tz);
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 if (gmx_mm256_any_lt(rsq10,rcutoff2))
310 r10 = _mm256_mul_pd(rsq10,rinv10);
312 /* Compute parameters for interactions between i and j atoms */
313 qq10 = _mm256_mul_pd(iq1,jq0);
315 /* EWALD ELECTROSTATICS */
317 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
318 ewrt = _mm256_mul_pd(r10,ewtabscale);
319 ewitab = _mm256_cvttpd_epi32(ewrt);
320 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
321 ewitab = _mm_slli_epi32(ewitab,2);
322 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
323 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
324 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
325 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
326 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
327 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
328 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
329 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
330 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
332 d = _mm256_sub_pd(r10,rswitch);
333 d = _mm256_max_pd(d,_mm256_setzero_pd());
334 d2 = _mm256_mul_pd(d,d);
335 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
337 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
339 /* Evaluate switch function */
340 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
341 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
342 velec = _mm256_mul_pd(velec,sw);
343 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
345 /* Update potential sum for this i atom from the interaction with this j atom. */
346 velec = _mm256_and_pd(velec,cutoff_mask);
347 velecsum = _mm256_add_pd(velecsum,velec);
351 fscal = _mm256_and_pd(fscal,cutoff_mask);
353 /* Calculate temporary vectorial force */
354 tx = _mm256_mul_pd(fscal,dx10);
355 ty = _mm256_mul_pd(fscal,dy10);
356 tz = _mm256_mul_pd(fscal,dz10);
358 /* Update vectorial force */
359 fix1 = _mm256_add_pd(fix1,tx);
360 fiy1 = _mm256_add_pd(fiy1,ty);
361 fiz1 = _mm256_add_pd(fiz1,tz);
363 fjx0 = _mm256_add_pd(fjx0,tx);
364 fjy0 = _mm256_add_pd(fjy0,ty);
365 fjz0 = _mm256_add_pd(fjz0,tz);
369 /**************************
370 * CALCULATE INTERACTIONS *
371 **************************/
373 if (gmx_mm256_any_lt(rsq20,rcutoff2))
376 r20 = _mm256_mul_pd(rsq20,rinv20);
378 /* Compute parameters for interactions between i and j atoms */
379 qq20 = _mm256_mul_pd(iq2,jq0);
381 /* EWALD ELECTROSTATICS */
383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384 ewrt = _mm256_mul_pd(r20,ewtabscale);
385 ewitab = _mm256_cvttpd_epi32(ewrt);
386 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
387 ewitab = _mm_slli_epi32(ewitab,2);
388 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
389 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
390 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
391 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
392 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
393 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
394 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
395 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
396 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
398 d = _mm256_sub_pd(r20,rswitch);
399 d = _mm256_max_pd(d,_mm256_setzero_pd());
400 d2 = _mm256_mul_pd(d,d);
401 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
403 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
405 /* Evaluate switch function */
406 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
407 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
408 velec = _mm256_mul_pd(velec,sw);
409 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velec = _mm256_and_pd(velec,cutoff_mask);
413 velecsum = _mm256_add_pd(velecsum,velec);
417 fscal = _mm256_and_pd(fscal,cutoff_mask);
419 /* Calculate temporary vectorial force */
420 tx = _mm256_mul_pd(fscal,dx20);
421 ty = _mm256_mul_pd(fscal,dy20);
422 tz = _mm256_mul_pd(fscal,dz20);
424 /* Update vectorial force */
425 fix2 = _mm256_add_pd(fix2,tx);
426 fiy2 = _mm256_add_pd(fiy2,ty);
427 fiz2 = _mm256_add_pd(fiz2,tz);
429 fjx0 = _mm256_add_pd(fjx0,tx);
430 fjy0 = _mm256_add_pd(fjy0,ty);
431 fjz0 = _mm256_add_pd(fjz0,tz);
435 fjptrA = f+j_coord_offsetA;
436 fjptrB = f+j_coord_offsetB;
437 fjptrC = f+j_coord_offsetC;
438 fjptrD = f+j_coord_offsetD;
440 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
442 /* Inner loop uses 198 flops */
448 /* Get j neighbor index, and coordinate index */
449 jnrlistA = jjnr[jidx];
450 jnrlistB = jjnr[jidx+1];
451 jnrlistC = jjnr[jidx+2];
452 jnrlistD = jjnr[jidx+3];
453 /* Sign of each element will be negative for non-real atoms.
454 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
455 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
457 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
459 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
460 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
461 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
463 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
464 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
465 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
466 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
467 j_coord_offsetA = DIM*jnrA;
468 j_coord_offsetB = DIM*jnrB;
469 j_coord_offsetC = DIM*jnrC;
470 j_coord_offsetD = DIM*jnrD;
472 /* load j atom coordinates */
473 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
474 x+j_coord_offsetC,x+j_coord_offsetD,
477 /* Calculate displacement vector */
478 dx00 = _mm256_sub_pd(ix0,jx0);
479 dy00 = _mm256_sub_pd(iy0,jy0);
480 dz00 = _mm256_sub_pd(iz0,jz0);
481 dx10 = _mm256_sub_pd(ix1,jx0);
482 dy10 = _mm256_sub_pd(iy1,jy0);
483 dz10 = _mm256_sub_pd(iz1,jz0);
484 dx20 = _mm256_sub_pd(ix2,jx0);
485 dy20 = _mm256_sub_pd(iy2,jy0);
486 dz20 = _mm256_sub_pd(iz2,jz0);
488 /* Calculate squared distance and things based on it */
489 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
490 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
491 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
493 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
494 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
495 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
497 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
498 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
499 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
501 /* Load parameters for j particles */
502 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
503 charge+jnrC+0,charge+jnrD+0);
505 fjx0 = _mm256_setzero_pd();
506 fjy0 = _mm256_setzero_pd();
507 fjz0 = _mm256_setzero_pd();
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 if (gmx_mm256_any_lt(rsq00,rcutoff2))
516 r00 = _mm256_mul_pd(rsq00,rinv00);
517 r00 = _mm256_andnot_pd(dummy_mask,r00);
519 /* Compute parameters for interactions between i and j atoms */
520 qq00 = _mm256_mul_pd(iq0,jq0);
522 /* EWALD ELECTROSTATICS */
524 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525 ewrt = _mm256_mul_pd(r00,ewtabscale);
526 ewitab = _mm256_cvttpd_epi32(ewrt);
527 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
528 ewitab = _mm_slli_epi32(ewitab,2);
529 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
530 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
531 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
532 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
533 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
534 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
535 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
536 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
537 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
539 d = _mm256_sub_pd(r00,rswitch);
540 d = _mm256_max_pd(d,_mm256_setzero_pd());
541 d2 = _mm256_mul_pd(d,d);
542 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
544 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
546 /* Evaluate switch function */
547 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
548 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
549 velec = _mm256_mul_pd(velec,sw);
550 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
552 /* Update potential sum for this i atom from the interaction with this j atom. */
553 velec = _mm256_and_pd(velec,cutoff_mask);
554 velec = _mm256_andnot_pd(dummy_mask,velec);
555 velecsum = _mm256_add_pd(velecsum,velec);
559 fscal = _mm256_and_pd(fscal,cutoff_mask);
561 fscal = _mm256_andnot_pd(dummy_mask,fscal);
563 /* Calculate temporary vectorial force */
564 tx = _mm256_mul_pd(fscal,dx00);
565 ty = _mm256_mul_pd(fscal,dy00);
566 tz = _mm256_mul_pd(fscal,dz00);
568 /* Update vectorial force */
569 fix0 = _mm256_add_pd(fix0,tx);
570 fiy0 = _mm256_add_pd(fiy0,ty);
571 fiz0 = _mm256_add_pd(fiz0,tz);
573 fjx0 = _mm256_add_pd(fjx0,tx);
574 fjy0 = _mm256_add_pd(fjy0,ty);
575 fjz0 = _mm256_add_pd(fjz0,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 if (gmx_mm256_any_lt(rsq10,rcutoff2))
586 r10 = _mm256_mul_pd(rsq10,rinv10);
587 r10 = _mm256_andnot_pd(dummy_mask,r10);
589 /* Compute parameters for interactions between i and j atoms */
590 qq10 = _mm256_mul_pd(iq1,jq0);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm256_mul_pd(r10,ewtabscale);
596 ewitab = _mm256_cvttpd_epi32(ewrt);
597 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
603 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
605 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
606 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
607 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
609 d = _mm256_sub_pd(r10,rswitch);
610 d = _mm256_max_pd(d,_mm256_setzero_pd());
611 d2 = _mm256_mul_pd(d,d);
612 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
614 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
616 /* Evaluate switch function */
617 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
618 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
619 velec = _mm256_mul_pd(velec,sw);
620 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec = _mm256_and_pd(velec,cutoff_mask);
624 velec = _mm256_andnot_pd(dummy_mask,velec);
625 velecsum = _mm256_add_pd(velecsum,velec);
629 fscal = _mm256_and_pd(fscal,cutoff_mask);
631 fscal = _mm256_andnot_pd(dummy_mask,fscal);
633 /* Calculate temporary vectorial force */
634 tx = _mm256_mul_pd(fscal,dx10);
635 ty = _mm256_mul_pd(fscal,dy10);
636 tz = _mm256_mul_pd(fscal,dz10);
638 /* Update vectorial force */
639 fix1 = _mm256_add_pd(fix1,tx);
640 fiy1 = _mm256_add_pd(fiy1,ty);
641 fiz1 = _mm256_add_pd(fiz1,tz);
643 fjx0 = _mm256_add_pd(fjx0,tx);
644 fjy0 = _mm256_add_pd(fjy0,ty);
645 fjz0 = _mm256_add_pd(fjz0,tz);
649 /**************************
650 * CALCULATE INTERACTIONS *
651 **************************/
653 if (gmx_mm256_any_lt(rsq20,rcutoff2))
656 r20 = _mm256_mul_pd(rsq20,rinv20);
657 r20 = _mm256_andnot_pd(dummy_mask,r20);
659 /* Compute parameters for interactions between i and j atoms */
660 qq20 = _mm256_mul_pd(iq2,jq0);
662 /* EWALD ELECTROSTATICS */
664 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
665 ewrt = _mm256_mul_pd(r20,ewtabscale);
666 ewitab = _mm256_cvttpd_epi32(ewrt);
667 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
668 ewitab = _mm_slli_epi32(ewitab,2);
669 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
670 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
671 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
672 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
673 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
674 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
675 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
676 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
677 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
679 d = _mm256_sub_pd(r20,rswitch);
680 d = _mm256_max_pd(d,_mm256_setzero_pd());
681 d2 = _mm256_mul_pd(d,d);
682 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
684 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
686 /* Evaluate switch function */
687 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
688 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
689 velec = _mm256_mul_pd(velec,sw);
690 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velec = _mm256_and_pd(velec,cutoff_mask);
694 velec = _mm256_andnot_pd(dummy_mask,velec);
695 velecsum = _mm256_add_pd(velecsum,velec);
699 fscal = _mm256_and_pd(fscal,cutoff_mask);
701 fscal = _mm256_andnot_pd(dummy_mask,fscal);
703 /* Calculate temporary vectorial force */
704 tx = _mm256_mul_pd(fscal,dx20);
705 ty = _mm256_mul_pd(fscal,dy20);
706 tz = _mm256_mul_pd(fscal,dz20);
708 /* Update vectorial force */
709 fix2 = _mm256_add_pd(fix2,tx);
710 fiy2 = _mm256_add_pd(fiy2,ty);
711 fiz2 = _mm256_add_pd(fiz2,tz);
713 fjx0 = _mm256_add_pd(fjx0,tx);
714 fjy0 = _mm256_add_pd(fjy0,ty);
715 fjz0 = _mm256_add_pd(fjz0,tz);
719 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
720 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
721 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
722 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
724 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
726 /* Inner loop uses 201 flops */
729 /* End of innermost loop */
731 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
732 f+i_coord_offset,fshift+i_shift_offset);
735 /* Update potential energies */
736 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
738 /* Increment number of inner iterations */
739 inneriter += j_index_end - j_index_start;
741 /* Outer loop uses 19 flops */
744 /* Increment number of outer iterations */
747 /* Update outer/inner flops */
749 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_VF,outeriter*19 + inneriter*201);
752 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_double
753 * Electrostatics interaction: Ewald
754 * VdW interaction: None
755 * Geometry: Water3-Particle
756 * Calculate force/pot: Force
759 nb_kernel_ElecEwSw_VdwNone_GeomW3P1_F_avx_256_double
760 (t_nblist * gmx_restrict nlist,
761 rvec * gmx_restrict xx,
762 rvec * gmx_restrict ff,
763 t_forcerec * gmx_restrict fr,
764 t_mdatoms * gmx_restrict mdatoms,
765 nb_kernel_data_t * gmx_restrict kernel_data,
766 t_nrnb * gmx_restrict nrnb)
768 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
769 * just 0 for non-waters.
770 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
771 * jnr indices corresponding to data put in the four positions in the SIMD register.
773 int i_shift_offset,i_coord_offset,outeriter,inneriter;
774 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
775 int jnrA,jnrB,jnrC,jnrD;
776 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
777 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
778 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
779 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
781 real *shiftvec,*fshift,*x,*f;
782 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
784 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
785 real * vdwioffsetptr0;
786 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
787 real * vdwioffsetptr1;
788 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
789 real * vdwioffsetptr2;
790 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
791 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
792 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
793 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
794 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
795 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
796 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
799 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
800 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
802 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
803 real rswitch_scalar,d_scalar;
804 __m256d dummy_mask,cutoff_mask;
805 __m128 tmpmask0,tmpmask1;
806 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
807 __m256d one = _mm256_set1_pd(1.0);
808 __m256d two = _mm256_set1_pd(2.0);
814 jindex = nlist->jindex;
816 shiftidx = nlist->shift;
818 shiftvec = fr->shift_vec[0];
819 fshift = fr->fshift[0];
820 facel = _mm256_set1_pd(fr->epsfac);
821 charge = mdatoms->chargeA;
823 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
824 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
825 beta2 = _mm256_mul_pd(beta,beta);
826 beta3 = _mm256_mul_pd(beta,beta2);
828 ewtab = fr->ic->tabq_coul_FDV0;
829 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
830 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
832 /* Setup water-specific parameters */
833 inr = nlist->iinr[0];
834 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
835 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
836 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
838 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
839 rcutoff_scalar = fr->rcoulomb;
840 rcutoff = _mm256_set1_pd(rcutoff_scalar);
841 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
843 rswitch_scalar = fr->rcoulomb_switch;
844 rswitch = _mm256_set1_pd(rswitch_scalar);
845 /* Setup switch parameters */
846 d_scalar = rcutoff_scalar-rswitch_scalar;
847 d = _mm256_set1_pd(d_scalar);
848 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
849 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
850 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
851 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
852 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
853 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
855 /* Avoid stupid compiler warnings */
856 jnrA = jnrB = jnrC = jnrD = 0;
865 for(iidx=0;iidx<4*DIM;iidx++)
870 /* Start outer loop over neighborlists */
871 for(iidx=0; iidx<nri; iidx++)
873 /* Load shift vector for this list */
874 i_shift_offset = DIM*shiftidx[iidx];
876 /* Load limits for loop over neighbors */
877 j_index_start = jindex[iidx];
878 j_index_end = jindex[iidx+1];
880 /* Get outer coordinate index */
882 i_coord_offset = DIM*inr;
884 /* Load i particle coords and add shift vector */
885 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
886 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
888 fix0 = _mm256_setzero_pd();
889 fiy0 = _mm256_setzero_pd();
890 fiz0 = _mm256_setzero_pd();
891 fix1 = _mm256_setzero_pd();
892 fiy1 = _mm256_setzero_pd();
893 fiz1 = _mm256_setzero_pd();
894 fix2 = _mm256_setzero_pd();
895 fiy2 = _mm256_setzero_pd();
896 fiz2 = _mm256_setzero_pd();
898 /* Start inner kernel loop */
899 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
902 /* Get j neighbor index, and coordinate index */
907 j_coord_offsetA = DIM*jnrA;
908 j_coord_offsetB = DIM*jnrB;
909 j_coord_offsetC = DIM*jnrC;
910 j_coord_offsetD = DIM*jnrD;
912 /* load j atom coordinates */
913 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
914 x+j_coord_offsetC,x+j_coord_offsetD,
917 /* Calculate displacement vector */
918 dx00 = _mm256_sub_pd(ix0,jx0);
919 dy00 = _mm256_sub_pd(iy0,jy0);
920 dz00 = _mm256_sub_pd(iz0,jz0);
921 dx10 = _mm256_sub_pd(ix1,jx0);
922 dy10 = _mm256_sub_pd(iy1,jy0);
923 dz10 = _mm256_sub_pd(iz1,jz0);
924 dx20 = _mm256_sub_pd(ix2,jx0);
925 dy20 = _mm256_sub_pd(iy2,jy0);
926 dz20 = _mm256_sub_pd(iz2,jz0);
928 /* Calculate squared distance and things based on it */
929 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
930 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
931 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
933 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
934 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
935 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
937 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
938 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
939 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
941 /* Load parameters for j particles */
942 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
943 charge+jnrC+0,charge+jnrD+0);
945 fjx0 = _mm256_setzero_pd();
946 fjy0 = _mm256_setzero_pd();
947 fjz0 = _mm256_setzero_pd();
949 /**************************
950 * CALCULATE INTERACTIONS *
951 **************************/
953 if (gmx_mm256_any_lt(rsq00,rcutoff2))
956 r00 = _mm256_mul_pd(rsq00,rinv00);
958 /* Compute parameters for interactions between i and j atoms */
959 qq00 = _mm256_mul_pd(iq0,jq0);
961 /* EWALD ELECTROSTATICS */
963 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
964 ewrt = _mm256_mul_pd(r00,ewtabscale);
965 ewitab = _mm256_cvttpd_epi32(ewrt);
966 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
967 ewitab = _mm_slli_epi32(ewitab,2);
968 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
969 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
970 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
971 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
972 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
973 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
974 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
975 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
976 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
978 d = _mm256_sub_pd(r00,rswitch);
979 d = _mm256_max_pd(d,_mm256_setzero_pd());
980 d2 = _mm256_mul_pd(d,d);
981 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
983 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
985 /* Evaluate switch function */
986 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
987 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
988 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
992 fscal = _mm256_and_pd(fscal,cutoff_mask);
994 /* Calculate temporary vectorial force */
995 tx = _mm256_mul_pd(fscal,dx00);
996 ty = _mm256_mul_pd(fscal,dy00);
997 tz = _mm256_mul_pd(fscal,dz00);
999 /* Update vectorial force */
1000 fix0 = _mm256_add_pd(fix0,tx);
1001 fiy0 = _mm256_add_pd(fiy0,ty);
1002 fiz0 = _mm256_add_pd(fiz0,tz);
1004 fjx0 = _mm256_add_pd(fjx0,tx);
1005 fjy0 = _mm256_add_pd(fjy0,ty);
1006 fjz0 = _mm256_add_pd(fjz0,tz);
1010 /**************************
1011 * CALCULATE INTERACTIONS *
1012 **************************/
1014 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1017 r10 = _mm256_mul_pd(rsq10,rinv10);
1019 /* Compute parameters for interactions between i and j atoms */
1020 qq10 = _mm256_mul_pd(iq1,jq0);
1022 /* EWALD ELECTROSTATICS */
1024 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1025 ewrt = _mm256_mul_pd(r10,ewtabscale);
1026 ewitab = _mm256_cvttpd_epi32(ewrt);
1027 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1028 ewitab = _mm_slli_epi32(ewitab,2);
1029 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1030 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1031 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1032 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1033 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1034 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1035 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1036 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1037 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1039 d = _mm256_sub_pd(r10,rswitch);
1040 d = _mm256_max_pd(d,_mm256_setzero_pd());
1041 d2 = _mm256_mul_pd(d,d);
1042 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1044 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1046 /* Evaluate switch function */
1047 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1048 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1049 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1053 fscal = _mm256_and_pd(fscal,cutoff_mask);
1055 /* Calculate temporary vectorial force */
1056 tx = _mm256_mul_pd(fscal,dx10);
1057 ty = _mm256_mul_pd(fscal,dy10);
1058 tz = _mm256_mul_pd(fscal,dz10);
1060 /* Update vectorial force */
1061 fix1 = _mm256_add_pd(fix1,tx);
1062 fiy1 = _mm256_add_pd(fiy1,ty);
1063 fiz1 = _mm256_add_pd(fiz1,tz);
1065 fjx0 = _mm256_add_pd(fjx0,tx);
1066 fjy0 = _mm256_add_pd(fjy0,ty);
1067 fjz0 = _mm256_add_pd(fjz0,tz);
1071 /**************************
1072 * CALCULATE INTERACTIONS *
1073 **************************/
1075 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1078 r20 = _mm256_mul_pd(rsq20,rinv20);
1080 /* Compute parameters for interactions between i and j atoms */
1081 qq20 = _mm256_mul_pd(iq2,jq0);
1083 /* EWALD ELECTROSTATICS */
1085 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1086 ewrt = _mm256_mul_pd(r20,ewtabscale);
1087 ewitab = _mm256_cvttpd_epi32(ewrt);
1088 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1089 ewitab = _mm_slli_epi32(ewitab,2);
1090 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1091 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1092 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1093 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1094 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1095 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1096 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1097 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1098 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1100 d = _mm256_sub_pd(r20,rswitch);
1101 d = _mm256_max_pd(d,_mm256_setzero_pd());
1102 d2 = _mm256_mul_pd(d,d);
1103 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1105 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1107 /* Evaluate switch function */
1108 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1109 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1110 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1114 fscal = _mm256_and_pd(fscal,cutoff_mask);
1116 /* Calculate temporary vectorial force */
1117 tx = _mm256_mul_pd(fscal,dx20);
1118 ty = _mm256_mul_pd(fscal,dy20);
1119 tz = _mm256_mul_pd(fscal,dz20);
1121 /* Update vectorial force */
1122 fix2 = _mm256_add_pd(fix2,tx);
1123 fiy2 = _mm256_add_pd(fiy2,ty);
1124 fiz2 = _mm256_add_pd(fiz2,tz);
1126 fjx0 = _mm256_add_pd(fjx0,tx);
1127 fjy0 = _mm256_add_pd(fjy0,ty);
1128 fjz0 = _mm256_add_pd(fjz0,tz);
1132 fjptrA = f+j_coord_offsetA;
1133 fjptrB = f+j_coord_offsetB;
1134 fjptrC = f+j_coord_offsetC;
1135 fjptrD = f+j_coord_offsetD;
1137 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1139 /* Inner loop uses 189 flops */
1142 if(jidx<j_index_end)
1145 /* Get j neighbor index, and coordinate index */
1146 jnrlistA = jjnr[jidx];
1147 jnrlistB = jjnr[jidx+1];
1148 jnrlistC = jjnr[jidx+2];
1149 jnrlistD = jjnr[jidx+3];
1150 /* Sign of each element will be negative for non-real atoms.
1151 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1152 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1154 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1156 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1157 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1158 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1160 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1161 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1162 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1163 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1164 j_coord_offsetA = DIM*jnrA;
1165 j_coord_offsetB = DIM*jnrB;
1166 j_coord_offsetC = DIM*jnrC;
1167 j_coord_offsetD = DIM*jnrD;
1169 /* load j atom coordinates */
1170 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1171 x+j_coord_offsetC,x+j_coord_offsetD,
1174 /* Calculate displacement vector */
1175 dx00 = _mm256_sub_pd(ix0,jx0);
1176 dy00 = _mm256_sub_pd(iy0,jy0);
1177 dz00 = _mm256_sub_pd(iz0,jz0);
1178 dx10 = _mm256_sub_pd(ix1,jx0);
1179 dy10 = _mm256_sub_pd(iy1,jy0);
1180 dz10 = _mm256_sub_pd(iz1,jz0);
1181 dx20 = _mm256_sub_pd(ix2,jx0);
1182 dy20 = _mm256_sub_pd(iy2,jy0);
1183 dz20 = _mm256_sub_pd(iz2,jz0);
1185 /* Calculate squared distance and things based on it */
1186 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1187 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1188 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1190 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1191 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1192 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1194 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1195 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1196 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1198 /* Load parameters for j particles */
1199 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1200 charge+jnrC+0,charge+jnrD+0);
1202 fjx0 = _mm256_setzero_pd();
1203 fjy0 = _mm256_setzero_pd();
1204 fjz0 = _mm256_setzero_pd();
1206 /**************************
1207 * CALCULATE INTERACTIONS *
1208 **************************/
1210 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1213 r00 = _mm256_mul_pd(rsq00,rinv00);
1214 r00 = _mm256_andnot_pd(dummy_mask,r00);
1216 /* Compute parameters for interactions between i and j atoms */
1217 qq00 = _mm256_mul_pd(iq0,jq0);
1219 /* EWALD ELECTROSTATICS */
1221 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1222 ewrt = _mm256_mul_pd(r00,ewtabscale);
1223 ewitab = _mm256_cvttpd_epi32(ewrt);
1224 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1225 ewitab = _mm_slli_epi32(ewitab,2);
1226 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1227 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1228 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1229 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1230 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1231 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1232 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1233 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1234 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1236 d = _mm256_sub_pd(r00,rswitch);
1237 d = _mm256_max_pd(d,_mm256_setzero_pd());
1238 d2 = _mm256_mul_pd(d,d);
1239 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1241 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1243 /* Evaluate switch function */
1244 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1245 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1246 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1250 fscal = _mm256_and_pd(fscal,cutoff_mask);
1252 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1254 /* Calculate temporary vectorial force */
1255 tx = _mm256_mul_pd(fscal,dx00);
1256 ty = _mm256_mul_pd(fscal,dy00);
1257 tz = _mm256_mul_pd(fscal,dz00);
1259 /* Update vectorial force */
1260 fix0 = _mm256_add_pd(fix0,tx);
1261 fiy0 = _mm256_add_pd(fiy0,ty);
1262 fiz0 = _mm256_add_pd(fiz0,tz);
1264 fjx0 = _mm256_add_pd(fjx0,tx);
1265 fjy0 = _mm256_add_pd(fjy0,ty);
1266 fjz0 = _mm256_add_pd(fjz0,tz);
1270 /**************************
1271 * CALCULATE INTERACTIONS *
1272 **************************/
1274 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1277 r10 = _mm256_mul_pd(rsq10,rinv10);
1278 r10 = _mm256_andnot_pd(dummy_mask,r10);
1280 /* Compute parameters for interactions between i and j atoms */
1281 qq10 = _mm256_mul_pd(iq1,jq0);
1283 /* EWALD ELECTROSTATICS */
1285 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1286 ewrt = _mm256_mul_pd(r10,ewtabscale);
1287 ewitab = _mm256_cvttpd_epi32(ewrt);
1288 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1289 ewitab = _mm_slli_epi32(ewitab,2);
1290 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1291 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1292 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1293 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1294 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1295 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1296 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1297 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1298 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1300 d = _mm256_sub_pd(r10,rswitch);
1301 d = _mm256_max_pd(d,_mm256_setzero_pd());
1302 d2 = _mm256_mul_pd(d,d);
1303 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1305 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1307 /* Evaluate switch function */
1308 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1309 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1310 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1314 fscal = _mm256_and_pd(fscal,cutoff_mask);
1316 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1318 /* Calculate temporary vectorial force */
1319 tx = _mm256_mul_pd(fscal,dx10);
1320 ty = _mm256_mul_pd(fscal,dy10);
1321 tz = _mm256_mul_pd(fscal,dz10);
1323 /* Update vectorial force */
1324 fix1 = _mm256_add_pd(fix1,tx);
1325 fiy1 = _mm256_add_pd(fiy1,ty);
1326 fiz1 = _mm256_add_pd(fiz1,tz);
1328 fjx0 = _mm256_add_pd(fjx0,tx);
1329 fjy0 = _mm256_add_pd(fjy0,ty);
1330 fjz0 = _mm256_add_pd(fjz0,tz);
1334 /**************************
1335 * CALCULATE INTERACTIONS *
1336 **************************/
1338 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1341 r20 = _mm256_mul_pd(rsq20,rinv20);
1342 r20 = _mm256_andnot_pd(dummy_mask,r20);
1344 /* Compute parameters for interactions between i and j atoms */
1345 qq20 = _mm256_mul_pd(iq2,jq0);
1347 /* EWALD ELECTROSTATICS */
1349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1350 ewrt = _mm256_mul_pd(r20,ewtabscale);
1351 ewitab = _mm256_cvttpd_epi32(ewrt);
1352 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1353 ewitab = _mm_slli_epi32(ewitab,2);
1354 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1355 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1356 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1357 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1358 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1359 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1360 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1361 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1362 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1364 d = _mm256_sub_pd(r20,rswitch);
1365 d = _mm256_max_pd(d,_mm256_setzero_pd());
1366 d2 = _mm256_mul_pd(d,d);
1367 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1369 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1371 /* Evaluate switch function */
1372 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1373 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1374 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1378 fscal = _mm256_and_pd(fscal,cutoff_mask);
1380 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1382 /* Calculate temporary vectorial force */
1383 tx = _mm256_mul_pd(fscal,dx20);
1384 ty = _mm256_mul_pd(fscal,dy20);
1385 tz = _mm256_mul_pd(fscal,dz20);
1387 /* Update vectorial force */
1388 fix2 = _mm256_add_pd(fix2,tx);
1389 fiy2 = _mm256_add_pd(fiy2,ty);
1390 fiz2 = _mm256_add_pd(fiz2,tz);
1392 fjx0 = _mm256_add_pd(fjx0,tx);
1393 fjy0 = _mm256_add_pd(fjy0,ty);
1394 fjz0 = _mm256_add_pd(fjz0,tz);
1398 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1399 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1400 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1401 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1403 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1405 /* Inner loop uses 192 flops */
1408 /* End of innermost loop */
1410 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1411 f+i_coord_offset,fshift+i_shift_offset);
1413 /* Increment number of inner iterations */
1414 inneriter += j_index_end - j_index_start;
1416 /* Outer loop uses 18 flops */
1419 /* Increment number of outer iterations */
1422 /* Update outer/inner flops */
1424 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3_F,outeriter*18 + inneriter*192);