2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B;
77 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B;
79 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B;
81 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
99 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
101 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
103 __m128d dummy_mask,cutoff_mask;
104 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
105 __m128d one = _mm_set1_pd(1.0);
106 __m128d two = _mm_set1_pd(2.0);
112 jindex = nlist->jindex;
114 shiftidx = nlist->shift;
116 shiftvec = fr->shift_vec[0];
117 fshift = fr->fshift[0];
118 facel = _mm_set1_pd(fr->epsfac);
119 charge = mdatoms->chargeA;
120 nvdwtype = fr->ntype;
122 vdwtype = mdatoms->typeA;
124 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
125 ewtab = fr->ic->tabq_coul_FDV0;
126 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
127 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
129 /* Setup water-specific parameters */
130 inr = nlist->iinr[0];
131 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
132 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
133 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
134 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
136 jq1 = _mm_set1_pd(charge[inr+1]);
137 jq2 = _mm_set1_pd(charge[inr+2]);
138 jq3 = _mm_set1_pd(charge[inr+3]);
139 vdwjidx0A = 2*vdwtype[inr+0];
140 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
141 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
142 qq11 = _mm_mul_pd(iq1,jq1);
143 qq12 = _mm_mul_pd(iq1,jq2);
144 qq13 = _mm_mul_pd(iq1,jq3);
145 qq21 = _mm_mul_pd(iq2,jq1);
146 qq22 = _mm_mul_pd(iq2,jq2);
147 qq23 = _mm_mul_pd(iq2,jq3);
148 qq31 = _mm_mul_pd(iq3,jq1);
149 qq32 = _mm_mul_pd(iq3,jq2);
150 qq33 = _mm_mul_pd(iq3,jq3);
152 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153 rcutoff_scalar = fr->rcoulomb;
154 rcutoff = _mm_set1_pd(rcutoff_scalar);
155 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
157 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
158 rvdw = _mm_set1_pd(fr->rvdw);
160 /* Avoid stupid compiler warnings */
168 /* Start outer loop over neighborlists */
169 for(iidx=0; iidx<nri; iidx++)
171 /* Load shift vector for this list */
172 i_shift_offset = DIM*shiftidx[iidx];
174 /* Load limits for loop over neighbors */
175 j_index_start = jindex[iidx];
176 j_index_end = jindex[iidx+1];
178 /* Get outer coordinate index */
180 i_coord_offset = DIM*inr;
182 /* Load i particle coords and add shift vector */
183 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
184 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
186 fix0 = _mm_setzero_pd();
187 fiy0 = _mm_setzero_pd();
188 fiz0 = _mm_setzero_pd();
189 fix1 = _mm_setzero_pd();
190 fiy1 = _mm_setzero_pd();
191 fiz1 = _mm_setzero_pd();
192 fix2 = _mm_setzero_pd();
193 fiy2 = _mm_setzero_pd();
194 fiz2 = _mm_setzero_pd();
195 fix3 = _mm_setzero_pd();
196 fiy3 = _mm_setzero_pd();
197 fiz3 = _mm_setzero_pd();
199 /* Reset potential sums */
200 velecsum = _mm_setzero_pd();
201 vvdwsum = _mm_setzero_pd();
203 /* Start inner kernel loop */
204 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
207 /* Get j neighbor index, and coordinate index */
210 j_coord_offsetA = DIM*jnrA;
211 j_coord_offsetB = DIM*jnrB;
213 /* load j atom coordinates */
214 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
216 &jy2,&jz2,&jx3,&jy3,&jz3);
218 /* Calculate displacement vector */
219 dx00 = _mm_sub_pd(ix0,jx0);
220 dy00 = _mm_sub_pd(iy0,jy0);
221 dz00 = _mm_sub_pd(iz0,jz0);
222 dx11 = _mm_sub_pd(ix1,jx1);
223 dy11 = _mm_sub_pd(iy1,jy1);
224 dz11 = _mm_sub_pd(iz1,jz1);
225 dx12 = _mm_sub_pd(ix1,jx2);
226 dy12 = _mm_sub_pd(iy1,jy2);
227 dz12 = _mm_sub_pd(iz1,jz2);
228 dx13 = _mm_sub_pd(ix1,jx3);
229 dy13 = _mm_sub_pd(iy1,jy3);
230 dz13 = _mm_sub_pd(iz1,jz3);
231 dx21 = _mm_sub_pd(ix2,jx1);
232 dy21 = _mm_sub_pd(iy2,jy1);
233 dz21 = _mm_sub_pd(iz2,jz1);
234 dx22 = _mm_sub_pd(ix2,jx2);
235 dy22 = _mm_sub_pd(iy2,jy2);
236 dz22 = _mm_sub_pd(iz2,jz2);
237 dx23 = _mm_sub_pd(ix2,jx3);
238 dy23 = _mm_sub_pd(iy2,jy3);
239 dz23 = _mm_sub_pd(iz2,jz3);
240 dx31 = _mm_sub_pd(ix3,jx1);
241 dy31 = _mm_sub_pd(iy3,jy1);
242 dz31 = _mm_sub_pd(iz3,jz1);
243 dx32 = _mm_sub_pd(ix3,jx2);
244 dy32 = _mm_sub_pd(iy3,jy2);
245 dz32 = _mm_sub_pd(iz3,jz2);
246 dx33 = _mm_sub_pd(ix3,jx3);
247 dy33 = _mm_sub_pd(iy3,jy3);
248 dz33 = _mm_sub_pd(iz3,jz3);
250 /* Calculate squared distance and things based on it */
251 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
252 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
253 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
254 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
255 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
256 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
257 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
258 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
259 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
260 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
262 rinv11 = gmx_mm_invsqrt_pd(rsq11);
263 rinv12 = gmx_mm_invsqrt_pd(rsq12);
264 rinv13 = gmx_mm_invsqrt_pd(rsq13);
265 rinv21 = gmx_mm_invsqrt_pd(rsq21);
266 rinv22 = gmx_mm_invsqrt_pd(rsq22);
267 rinv23 = gmx_mm_invsqrt_pd(rsq23);
268 rinv31 = gmx_mm_invsqrt_pd(rsq31);
269 rinv32 = gmx_mm_invsqrt_pd(rsq32);
270 rinv33 = gmx_mm_invsqrt_pd(rsq33);
272 rinvsq00 = gmx_mm_inv_pd(rsq00);
273 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
274 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
275 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
276 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
277 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
278 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
279 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
280 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
281 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
283 fjx0 = _mm_setzero_pd();
284 fjy0 = _mm_setzero_pd();
285 fjz0 = _mm_setzero_pd();
286 fjx1 = _mm_setzero_pd();
287 fjy1 = _mm_setzero_pd();
288 fjz1 = _mm_setzero_pd();
289 fjx2 = _mm_setzero_pd();
290 fjy2 = _mm_setzero_pd();
291 fjz2 = _mm_setzero_pd();
292 fjx3 = _mm_setzero_pd();
293 fjy3 = _mm_setzero_pd();
294 fjz3 = _mm_setzero_pd();
296 /**************************
297 * CALCULATE INTERACTIONS *
298 **************************/
300 if (gmx_mm_any_lt(rsq00,rcutoff2))
303 /* LENNARD-JONES DISPERSION/REPULSION */
305 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
306 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
307 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
308 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
309 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
310 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
312 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
314 /* Update potential sum for this i atom from the interaction with this j atom. */
315 vvdw = _mm_and_pd(vvdw,cutoff_mask);
316 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
320 fscal = _mm_and_pd(fscal,cutoff_mask);
322 /* Update vectorial force */
323 fix0 = _mm_macc_pd(dx00,fscal,fix0);
324 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
325 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
327 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
328 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
329 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
333 /**************************
334 * CALCULATE INTERACTIONS *
335 **************************/
337 if (gmx_mm_any_lt(rsq11,rcutoff2))
340 r11 = _mm_mul_pd(rsq11,rinv11);
342 /* EWALD ELECTROSTATICS */
344 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
345 ewrt = _mm_mul_pd(r11,ewtabscale);
346 ewitab = _mm_cvttpd_epi32(ewrt);
348 eweps = _mm_frcz_pd(ewrt);
350 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
352 twoeweps = _mm_add_pd(eweps,eweps);
353 ewitab = _mm_slli_epi32(ewitab,2);
354 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
355 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
356 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
357 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
358 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
359 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
360 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
361 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
362 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
363 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
365 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
367 /* Update potential sum for this i atom from the interaction with this j atom. */
368 velec = _mm_and_pd(velec,cutoff_mask);
369 velecsum = _mm_add_pd(velecsum,velec);
373 fscal = _mm_and_pd(fscal,cutoff_mask);
375 /* Update vectorial force */
376 fix1 = _mm_macc_pd(dx11,fscal,fix1);
377 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
378 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
380 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
381 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
382 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 if (gmx_mm_any_lt(rsq12,rcutoff2))
393 r12 = _mm_mul_pd(rsq12,rinv12);
395 /* EWALD ELECTROSTATICS */
397 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
398 ewrt = _mm_mul_pd(r12,ewtabscale);
399 ewitab = _mm_cvttpd_epi32(ewrt);
401 eweps = _mm_frcz_pd(ewrt);
403 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
405 twoeweps = _mm_add_pd(eweps,eweps);
406 ewitab = _mm_slli_epi32(ewitab,2);
407 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
408 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
409 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
410 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
411 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
412 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
413 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
414 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
415 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
416 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
418 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
420 /* Update potential sum for this i atom from the interaction with this j atom. */
421 velec = _mm_and_pd(velec,cutoff_mask);
422 velecsum = _mm_add_pd(velecsum,velec);
426 fscal = _mm_and_pd(fscal,cutoff_mask);
428 /* Update vectorial force */
429 fix1 = _mm_macc_pd(dx12,fscal,fix1);
430 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
431 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
433 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
434 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
435 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
439 /**************************
440 * CALCULATE INTERACTIONS *
441 **************************/
443 if (gmx_mm_any_lt(rsq13,rcutoff2))
446 r13 = _mm_mul_pd(rsq13,rinv13);
448 /* EWALD ELECTROSTATICS */
450 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
451 ewrt = _mm_mul_pd(r13,ewtabscale);
452 ewitab = _mm_cvttpd_epi32(ewrt);
454 eweps = _mm_frcz_pd(ewrt);
456 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
458 twoeweps = _mm_add_pd(eweps,eweps);
459 ewitab = _mm_slli_epi32(ewitab,2);
460 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
461 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
462 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
463 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
464 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
465 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
466 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
467 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
468 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
469 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
471 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
473 /* Update potential sum for this i atom from the interaction with this j atom. */
474 velec = _mm_and_pd(velec,cutoff_mask);
475 velecsum = _mm_add_pd(velecsum,velec);
479 fscal = _mm_and_pd(fscal,cutoff_mask);
481 /* Update vectorial force */
482 fix1 = _mm_macc_pd(dx13,fscal,fix1);
483 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
484 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
486 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
487 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
488 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
492 /**************************
493 * CALCULATE INTERACTIONS *
494 **************************/
496 if (gmx_mm_any_lt(rsq21,rcutoff2))
499 r21 = _mm_mul_pd(rsq21,rinv21);
501 /* EWALD ELECTROSTATICS */
503 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
504 ewrt = _mm_mul_pd(r21,ewtabscale);
505 ewitab = _mm_cvttpd_epi32(ewrt);
507 eweps = _mm_frcz_pd(ewrt);
509 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
511 twoeweps = _mm_add_pd(eweps,eweps);
512 ewitab = _mm_slli_epi32(ewitab,2);
513 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
514 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
515 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
516 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
517 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
518 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
519 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
520 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
521 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
522 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
524 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
526 /* Update potential sum for this i atom from the interaction with this j atom. */
527 velec = _mm_and_pd(velec,cutoff_mask);
528 velecsum = _mm_add_pd(velecsum,velec);
532 fscal = _mm_and_pd(fscal,cutoff_mask);
534 /* Update vectorial force */
535 fix2 = _mm_macc_pd(dx21,fscal,fix2);
536 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
537 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
539 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
540 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
541 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
549 if (gmx_mm_any_lt(rsq22,rcutoff2))
552 r22 = _mm_mul_pd(rsq22,rinv22);
554 /* EWALD ELECTROSTATICS */
556 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
557 ewrt = _mm_mul_pd(r22,ewtabscale);
558 ewitab = _mm_cvttpd_epi32(ewrt);
560 eweps = _mm_frcz_pd(ewrt);
562 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
564 twoeweps = _mm_add_pd(eweps,eweps);
565 ewitab = _mm_slli_epi32(ewitab,2);
566 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
567 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
568 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
569 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
570 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
571 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
572 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
573 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
574 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
575 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
577 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
579 /* Update potential sum for this i atom from the interaction with this j atom. */
580 velec = _mm_and_pd(velec,cutoff_mask);
581 velecsum = _mm_add_pd(velecsum,velec);
585 fscal = _mm_and_pd(fscal,cutoff_mask);
587 /* Update vectorial force */
588 fix2 = _mm_macc_pd(dx22,fscal,fix2);
589 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
590 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
592 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
593 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
594 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
598 /**************************
599 * CALCULATE INTERACTIONS *
600 **************************/
602 if (gmx_mm_any_lt(rsq23,rcutoff2))
605 r23 = _mm_mul_pd(rsq23,rinv23);
607 /* EWALD ELECTROSTATICS */
609 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
610 ewrt = _mm_mul_pd(r23,ewtabscale);
611 ewitab = _mm_cvttpd_epi32(ewrt);
613 eweps = _mm_frcz_pd(ewrt);
615 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
617 twoeweps = _mm_add_pd(eweps,eweps);
618 ewitab = _mm_slli_epi32(ewitab,2);
619 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
620 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
621 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
622 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
623 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
624 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
625 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
626 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
627 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
628 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
630 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
632 /* Update potential sum for this i atom from the interaction with this j atom. */
633 velec = _mm_and_pd(velec,cutoff_mask);
634 velecsum = _mm_add_pd(velecsum,velec);
638 fscal = _mm_and_pd(fscal,cutoff_mask);
640 /* Update vectorial force */
641 fix2 = _mm_macc_pd(dx23,fscal,fix2);
642 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
643 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
645 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
646 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
647 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
651 /**************************
652 * CALCULATE INTERACTIONS *
653 **************************/
655 if (gmx_mm_any_lt(rsq31,rcutoff2))
658 r31 = _mm_mul_pd(rsq31,rinv31);
660 /* EWALD ELECTROSTATICS */
662 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
663 ewrt = _mm_mul_pd(r31,ewtabscale);
664 ewitab = _mm_cvttpd_epi32(ewrt);
666 eweps = _mm_frcz_pd(ewrt);
668 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
670 twoeweps = _mm_add_pd(eweps,eweps);
671 ewitab = _mm_slli_epi32(ewitab,2);
672 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
673 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
674 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
675 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
676 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
677 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
678 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
679 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
680 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
681 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
683 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
685 /* Update potential sum for this i atom from the interaction with this j atom. */
686 velec = _mm_and_pd(velec,cutoff_mask);
687 velecsum = _mm_add_pd(velecsum,velec);
691 fscal = _mm_and_pd(fscal,cutoff_mask);
693 /* Update vectorial force */
694 fix3 = _mm_macc_pd(dx31,fscal,fix3);
695 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
696 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
698 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
699 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
700 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
704 /**************************
705 * CALCULATE INTERACTIONS *
706 **************************/
708 if (gmx_mm_any_lt(rsq32,rcutoff2))
711 r32 = _mm_mul_pd(rsq32,rinv32);
713 /* EWALD ELECTROSTATICS */
715 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
716 ewrt = _mm_mul_pd(r32,ewtabscale);
717 ewitab = _mm_cvttpd_epi32(ewrt);
719 eweps = _mm_frcz_pd(ewrt);
721 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
723 twoeweps = _mm_add_pd(eweps,eweps);
724 ewitab = _mm_slli_epi32(ewitab,2);
725 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
726 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
727 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
728 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
729 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
730 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
731 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
732 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
733 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
734 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
736 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
738 /* Update potential sum for this i atom from the interaction with this j atom. */
739 velec = _mm_and_pd(velec,cutoff_mask);
740 velecsum = _mm_add_pd(velecsum,velec);
744 fscal = _mm_and_pd(fscal,cutoff_mask);
746 /* Update vectorial force */
747 fix3 = _mm_macc_pd(dx32,fscal,fix3);
748 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
749 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
751 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
752 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
753 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
757 /**************************
758 * CALCULATE INTERACTIONS *
759 **************************/
761 if (gmx_mm_any_lt(rsq33,rcutoff2))
764 r33 = _mm_mul_pd(rsq33,rinv33);
766 /* EWALD ELECTROSTATICS */
768 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
769 ewrt = _mm_mul_pd(r33,ewtabscale);
770 ewitab = _mm_cvttpd_epi32(ewrt);
772 eweps = _mm_frcz_pd(ewrt);
774 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
776 twoeweps = _mm_add_pd(eweps,eweps);
777 ewitab = _mm_slli_epi32(ewitab,2);
778 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
779 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
780 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
781 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
782 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
783 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
784 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
785 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
786 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
787 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
789 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
791 /* Update potential sum for this i atom from the interaction with this j atom. */
792 velec = _mm_and_pd(velec,cutoff_mask);
793 velecsum = _mm_add_pd(velecsum,velec);
797 fscal = _mm_and_pd(fscal,cutoff_mask);
799 /* Update vectorial force */
800 fix3 = _mm_macc_pd(dx33,fscal,fix3);
801 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
802 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
804 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
805 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
806 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
810 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
812 /* Inner loop uses 488 flops */
819 j_coord_offsetA = DIM*jnrA;
821 /* load j atom coordinates */
822 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
823 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
824 &jy2,&jz2,&jx3,&jy3,&jz3);
826 /* Calculate displacement vector */
827 dx00 = _mm_sub_pd(ix0,jx0);
828 dy00 = _mm_sub_pd(iy0,jy0);
829 dz00 = _mm_sub_pd(iz0,jz0);
830 dx11 = _mm_sub_pd(ix1,jx1);
831 dy11 = _mm_sub_pd(iy1,jy1);
832 dz11 = _mm_sub_pd(iz1,jz1);
833 dx12 = _mm_sub_pd(ix1,jx2);
834 dy12 = _mm_sub_pd(iy1,jy2);
835 dz12 = _mm_sub_pd(iz1,jz2);
836 dx13 = _mm_sub_pd(ix1,jx3);
837 dy13 = _mm_sub_pd(iy1,jy3);
838 dz13 = _mm_sub_pd(iz1,jz3);
839 dx21 = _mm_sub_pd(ix2,jx1);
840 dy21 = _mm_sub_pd(iy2,jy1);
841 dz21 = _mm_sub_pd(iz2,jz1);
842 dx22 = _mm_sub_pd(ix2,jx2);
843 dy22 = _mm_sub_pd(iy2,jy2);
844 dz22 = _mm_sub_pd(iz2,jz2);
845 dx23 = _mm_sub_pd(ix2,jx3);
846 dy23 = _mm_sub_pd(iy2,jy3);
847 dz23 = _mm_sub_pd(iz2,jz3);
848 dx31 = _mm_sub_pd(ix3,jx1);
849 dy31 = _mm_sub_pd(iy3,jy1);
850 dz31 = _mm_sub_pd(iz3,jz1);
851 dx32 = _mm_sub_pd(ix3,jx2);
852 dy32 = _mm_sub_pd(iy3,jy2);
853 dz32 = _mm_sub_pd(iz3,jz2);
854 dx33 = _mm_sub_pd(ix3,jx3);
855 dy33 = _mm_sub_pd(iy3,jy3);
856 dz33 = _mm_sub_pd(iz3,jz3);
858 /* Calculate squared distance and things based on it */
859 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
860 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
861 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
862 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
863 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
864 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
865 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
866 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
867 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
868 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
870 rinv11 = gmx_mm_invsqrt_pd(rsq11);
871 rinv12 = gmx_mm_invsqrt_pd(rsq12);
872 rinv13 = gmx_mm_invsqrt_pd(rsq13);
873 rinv21 = gmx_mm_invsqrt_pd(rsq21);
874 rinv22 = gmx_mm_invsqrt_pd(rsq22);
875 rinv23 = gmx_mm_invsqrt_pd(rsq23);
876 rinv31 = gmx_mm_invsqrt_pd(rsq31);
877 rinv32 = gmx_mm_invsqrt_pd(rsq32);
878 rinv33 = gmx_mm_invsqrt_pd(rsq33);
880 rinvsq00 = gmx_mm_inv_pd(rsq00);
881 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
882 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
883 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
884 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
885 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
886 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
887 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
888 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
889 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
891 fjx0 = _mm_setzero_pd();
892 fjy0 = _mm_setzero_pd();
893 fjz0 = _mm_setzero_pd();
894 fjx1 = _mm_setzero_pd();
895 fjy1 = _mm_setzero_pd();
896 fjz1 = _mm_setzero_pd();
897 fjx2 = _mm_setzero_pd();
898 fjy2 = _mm_setzero_pd();
899 fjz2 = _mm_setzero_pd();
900 fjx3 = _mm_setzero_pd();
901 fjy3 = _mm_setzero_pd();
902 fjz3 = _mm_setzero_pd();
904 /**************************
905 * CALCULATE INTERACTIONS *
906 **************************/
908 if (gmx_mm_any_lt(rsq00,rcutoff2))
911 /* LENNARD-JONES DISPERSION/REPULSION */
913 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
914 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
915 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
916 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
917 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
918 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
920 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
922 /* Update potential sum for this i atom from the interaction with this j atom. */
923 vvdw = _mm_and_pd(vvdw,cutoff_mask);
924 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
925 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
929 fscal = _mm_and_pd(fscal,cutoff_mask);
931 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
933 /* Update vectorial force */
934 fix0 = _mm_macc_pd(dx00,fscal,fix0);
935 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
936 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
938 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
939 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
940 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 if (gmx_mm_any_lt(rsq11,rcutoff2))
951 r11 = _mm_mul_pd(rsq11,rinv11);
953 /* EWALD ELECTROSTATICS */
955 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
956 ewrt = _mm_mul_pd(r11,ewtabscale);
957 ewitab = _mm_cvttpd_epi32(ewrt);
959 eweps = _mm_frcz_pd(ewrt);
961 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
963 twoeweps = _mm_add_pd(eweps,eweps);
964 ewitab = _mm_slli_epi32(ewitab,2);
965 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
966 ewtabD = _mm_setzero_pd();
967 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
968 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
969 ewtabFn = _mm_setzero_pd();
970 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
971 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
972 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
973 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
974 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
976 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
978 /* Update potential sum for this i atom from the interaction with this j atom. */
979 velec = _mm_and_pd(velec,cutoff_mask);
980 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
981 velecsum = _mm_add_pd(velecsum,velec);
985 fscal = _mm_and_pd(fscal,cutoff_mask);
987 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
989 /* Update vectorial force */
990 fix1 = _mm_macc_pd(dx11,fscal,fix1);
991 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
992 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
994 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
995 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
996 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1000 /**************************
1001 * CALCULATE INTERACTIONS *
1002 **************************/
1004 if (gmx_mm_any_lt(rsq12,rcutoff2))
1007 r12 = _mm_mul_pd(rsq12,rinv12);
1009 /* EWALD ELECTROSTATICS */
1011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1012 ewrt = _mm_mul_pd(r12,ewtabscale);
1013 ewitab = _mm_cvttpd_epi32(ewrt);
1015 eweps = _mm_frcz_pd(ewrt);
1017 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1019 twoeweps = _mm_add_pd(eweps,eweps);
1020 ewitab = _mm_slli_epi32(ewitab,2);
1021 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1022 ewtabD = _mm_setzero_pd();
1023 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1024 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1025 ewtabFn = _mm_setzero_pd();
1026 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1027 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1028 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1029 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1030 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1032 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1034 /* Update potential sum for this i atom from the interaction with this j atom. */
1035 velec = _mm_and_pd(velec,cutoff_mask);
1036 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1037 velecsum = _mm_add_pd(velecsum,velec);
1041 fscal = _mm_and_pd(fscal,cutoff_mask);
1043 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1045 /* Update vectorial force */
1046 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1047 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1048 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1050 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1051 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1052 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1056 /**************************
1057 * CALCULATE INTERACTIONS *
1058 **************************/
1060 if (gmx_mm_any_lt(rsq13,rcutoff2))
1063 r13 = _mm_mul_pd(rsq13,rinv13);
1065 /* EWALD ELECTROSTATICS */
1067 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1068 ewrt = _mm_mul_pd(r13,ewtabscale);
1069 ewitab = _mm_cvttpd_epi32(ewrt);
1071 eweps = _mm_frcz_pd(ewrt);
1073 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1075 twoeweps = _mm_add_pd(eweps,eweps);
1076 ewitab = _mm_slli_epi32(ewitab,2);
1077 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1078 ewtabD = _mm_setzero_pd();
1079 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1080 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1081 ewtabFn = _mm_setzero_pd();
1082 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1083 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1084 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1085 velec = _mm_mul_pd(qq13,_mm_sub_pd(_mm_sub_pd(rinv13,sh_ewald),velec));
1086 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1088 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1090 /* Update potential sum for this i atom from the interaction with this j atom. */
1091 velec = _mm_and_pd(velec,cutoff_mask);
1092 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1093 velecsum = _mm_add_pd(velecsum,velec);
1097 fscal = _mm_and_pd(fscal,cutoff_mask);
1099 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1101 /* Update vectorial force */
1102 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1103 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1104 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1106 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1107 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1108 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1112 /**************************
1113 * CALCULATE INTERACTIONS *
1114 **************************/
1116 if (gmx_mm_any_lt(rsq21,rcutoff2))
1119 r21 = _mm_mul_pd(rsq21,rinv21);
1121 /* EWALD ELECTROSTATICS */
1123 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1124 ewrt = _mm_mul_pd(r21,ewtabscale);
1125 ewitab = _mm_cvttpd_epi32(ewrt);
1127 eweps = _mm_frcz_pd(ewrt);
1129 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1131 twoeweps = _mm_add_pd(eweps,eweps);
1132 ewitab = _mm_slli_epi32(ewitab,2);
1133 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1134 ewtabD = _mm_setzero_pd();
1135 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1136 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1137 ewtabFn = _mm_setzero_pd();
1138 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1139 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1140 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1141 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1142 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1144 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1146 /* Update potential sum for this i atom from the interaction with this j atom. */
1147 velec = _mm_and_pd(velec,cutoff_mask);
1148 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1149 velecsum = _mm_add_pd(velecsum,velec);
1153 fscal = _mm_and_pd(fscal,cutoff_mask);
1155 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1157 /* Update vectorial force */
1158 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1159 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1160 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1162 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1163 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1164 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1168 /**************************
1169 * CALCULATE INTERACTIONS *
1170 **************************/
1172 if (gmx_mm_any_lt(rsq22,rcutoff2))
1175 r22 = _mm_mul_pd(rsq22,rinv22);
1177 /* EWALD ELECTROSTATICS */
1179 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1180 ewrt = _mm_mul_pd(r22,ewtabscale);
1181 ewitab = _mm_cvttpd_epi32(ewrt);
1183 eweps = _mm_frcz_pd(ewrt);
1185 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1187 twoeweps = _mm_add_pd(eweps,eweps);
1188 ewitab = _mm_slli_epi32(ewitab,2);
1189 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1190 ewtabD = _mm_setzero_pd();
1191 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1192 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1193 ewtabFn = _mm_setzero_pd();
1194 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1195 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1196 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1197 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1198 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1200 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1202 /* Update potential sum for this i atom from the interaction with this j atom. */
1203 velec = _mm_and_pd(velec,cutoff_mask);
1204 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1205 velecsum = _mm_add_pd(velecsum,velec);
1209 fscal = _mm_and_pd(fscal,cutoff_mask);
1211 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1213 /* Update vectorial force */
1214 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1215 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1216 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1218 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1219 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1220 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1224 /**************************
1225 * CALCULATE INTERACTIONS *
1226 **************************/
1228 if (gmx_mm_any_lt(rsq23,rcutoff2))
1231 r23 = _mm_mul_pd(rsq23,rinv23);
1233 /* EWALD ELECTROSTATICS */
1235 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1236 ewrt = _mm_mul_pd(r23,ewtabscale);
1237 ewitab = _mm_cvttpd_epi32(ewrt);
1239 eweps = _mm_frcz_pd(ewrt);
1241 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1243 twoeweps = _mm_add_pd(eweps,eweps);
1244 ewitab = _mm_slli_epi32(ewitab,2);
1245 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1246 ewtabD = _mm_setzero_pd();
1247 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1248 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1249 ewtabFn = _mm_setzero_pd();
1250 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1251 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1252 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1253 velec = _mm_mul_pd(qq23,_mm_sub_pd(_mm_sub_pd(rinv23,sh_ewald),velec));
1254 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1256 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
1258 /* Update potential sum for this i atom from the interaction with this j atom. */
1259 velec = _mm_and_pd(velec,cutoff_mask);
1260 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1261 velecsum = _mm_add_pd(velecsum,velec);
1265 fscal = _mm_and_pd(fscal,cutoff_mask);
1267 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1269 /* Update vectorial force */
1270 fix2 = _mm_macc_pd(dx23,fscal,fix2);
1271 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
1272 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
1274 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
1275 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
1276 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
1280 /**************************
1281 * CALCULATE INTERACTIONS *
1282 **************************/
1284 if (gmx_mm_any_lt(rsq31,rcutoff2))
1287 r31 = _mm_mul_pd(rsq31,rinv31);
1289 /* EWALD ELECTROSTATICS */
1291 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1292 ewrt = _mm_mul_pd(r31,ewtabscale);
1293 ewitab = _mm_cvttpd_epi32(ewrt);
1295 eweps = _mm_frcz_pd(ewrt);
1297 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1299 twoeweps = _mm_add_pd(eweps,eweps);
1300 ewitab = _mm_slli_epi32(ewitab,2);
1301 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1302 ewtabD = _mm_setzero_pd();
1303 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1304 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1305 ewtabFn = _mm_setzero_pd();
1306 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1307 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1308 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1309 velec = _mm_mul_pd(qq31,_mm_sub_pd(_mm_sub_pd(rinv31,sh_ewald),velec));
1310 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
1312 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
1314 /* Update potential sum for this i atom from the interaction with this j atom. */
1315 velec = _mm_and_pd(velec,cutoff_mask);
1316 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1317 velecsum = _mm_add_pd(velecsum,velec);
1321 fscal = _mm_and_pd(fscal,cutoff_mask);
1323 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1325 /* Update vectorial force */
1326 fix3 = _mm_macc_pd(dx31,fscal,fix3);
1327 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
1328 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
1330 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
1331 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
1332 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
1336 /**************************
1337 * CALCULATE INTERACTIONS *
1338 **************************/
1340 if (gmx_mm_any_lt(rsq32,rcutoff2))
1343 r32 = _mm_mul_pd(rsq32,rinv32);
1345 /* EWALD ELECTROSTATICS */
1347 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1348 ewrt = _mm_mul_pd(r32,ewtabscale);
1349 ewitab = _mm_cvttpd_epi32(ewrt);
1351 eweps = _mm_frcz_pd(ewrt);
1353 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1355 twoeweps = _mm_add_pd(eweps,eweps);
1356 ewitab = _mm_slli_epi32(ewitab,2);
1357 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1358 ewtabD = _mm_setzero_pd();
1359 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1360 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1361 ewtabFn = _mm_setzero_pd();
1362 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1363 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1364 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1365 velec = _mm_mul_pd(qq32,_mm_sub_pd(_mm_sub_pd(rinv32,sh_ewald),velec));
1366 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
1368 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
1370 /* Update potential sum for this i atom from the interaction with this j atom. */
1371 velec = _mm_and_pd(velec,cutoff_mask);
1372 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1373 velecsum = _mm_add_pd(velecsum,velec);
1377 fscal = _mm_and_pd(fscal,cutoff_mask);
1379 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1381 /* Update vectorial force */
1382 fix3 = _mm_macc_pd(dx32,fscal,fix3);
1383 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
1384 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
1386 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
1387 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
1388 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
1392 /**************************
1393 * CALCULATE INTERACTIONS *
1394 **************************/
1396 if (gmx_mm_any_lt(rsq33,rcutoff2))
1399 r33 = _mm_mul_pd(rsq33,rinv33);
1401 /* EWALD ELECTROSTATICS */
1403 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1404 ewrt = _mm_mul_pd(r33,ewtabscale);
1405 ewitab = _mm_cvttpd_epi32(ewrt);
1407 eweps = _mm_frcz_pd(ewrt);
1409 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1411 twoeweps = _mm_add_pd(eweps,eweps);
1412 ewitab = _mm_slli_epi32(ewitab,2);
1413 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1414 ewtabD = _mm_setzero_pd();
1415 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1416 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1417 ewtabFn = _mm_setzero_pd();
1418 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1419 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1420 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1421 velec = _mm_mul_pd(qq33,_mm_sub_pd(_mm_sub_pd(rinv33,sh_ewald),velec));
1422 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
1424 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
1426 /* Update potential sum for this i atom from the interaction with this j atom. */
1427 velec = _mm_and_pd(velec,cutoff_mask);
1428 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1429 velecsum = _mm_add_pd(velecsum,velec);
1433 fscal = _mm_and_pd(fscal,cutoff_mask);
1435 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1437 /* Update vectorial force */
1438 fix3 = _mm_macc_pd(dx33,fscal,fix3);
1439 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
1440 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
1442 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
1443 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
1444 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
1448 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1450 /* Inner loop uses 488 flops */
1453 /* End of innermost loop */
1455 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1456 f+i_coord_offset,fshift+i_shift_offset);
1459 /* Update potential energies */
1460 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1461 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1463 /* Increment number of inner iterations */
1464 inneriter += j_index_end - j_index_start;
1466 /* Outer loop uses 26 flops */
1469 /* Increment number of outer iterations */
1472 /* Update outer/inner flops */
1474 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*488);
1477 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_double
1478 * Electrostatics interaction: Ewald
1479 * VdW interaction: LennardJones
1480 * Geometry: Water4-Water4
1481 * Calculate force/pot: Force
1484 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_128_fma_double
1485 (t_nblist * gmx_restrict nlist,
1486 rvec * gmx_restrict xx,
1487 rvec * gmx_restrict ff,
1488 t_forcerec * gmx_restrict fr,
1489 t_mdatoms * gmx_restrict mdatoms,
1490 nb_kernel_data_t * gmx_restrict kernel_data,
1491 t_nrnb * gmx_restrict nrnb)
1493 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1494 * just 0 for non-waters.
1495 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1496 * jnr indices corresponding to data put in the four positions in the SIMD register.
1498 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1499 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1501 int j_coord_offsetA,j_coord_offsetB;
1502 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1503 real rcutoff_scalar;
1504 real *shiftvec,*fshift,*x,*f;
1505 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1507 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1509 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1511 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1513 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1514 int vdwjidx0A,vdwjidx0B;
1515 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1516 int vdwjidx1A,vdwjidx1B;
1517 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1518 int vdwjidx2A,vdwjidx2B;
1519 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1520 int vdwjidx3A,vdwjidx3B;
1521 __m128d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1522 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1523 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1524 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1525 __m128d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1526 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1527 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1528 __m128d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1529 __m128d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1530 __m128d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1531 __m128d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1532 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1535 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1538 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1539 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1541 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1543 __m128d dummy_mask,cutoff_mask;
1544 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1545 __m128d one = _mm_set1_pd(1.0);
1546 __m128d two = _mm_set1_pd(2.0);
1552 jindex = nlist->jindex;
1554 shiftidx = nlist->shift;
1556 shiftvec = fr->shift_vec[0];
1557 fshift = fr->fshift[0];
1558 facel = _mm_set1_pd(fr->epsfac);
1559 charge = mdatoms->chargeA;
1560 nvdwtype = fr->ntype;
1561 vdwparam = fr->nbfp;
1562 vdwtype = mdatoms->typeA;
1564 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1565 ewtab = fr->ic->tabq_coul_F;
1566 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1567 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1569 /* Setup water-specific parameters */
1570 inr = nlist->iinr[0];
1571 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1572 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1573 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1574 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1576 jq1 = _mm_set1_pd(charge[inr+1]);
1577 jq2 = _mm_set1_pd(charge[inr+2]);
1578 jq3 = _mm_set1_pd(charge[inr+3]);
1579 vdwjidx0A = 2*vdwtype[inr+0];
1580 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1581 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1582 qq11 = _mm_mul_pd(iq1,jq1);
1583 qq12 = _mm_mul_pd(iq1,jq2);
1584 qq13 = _mm_mul_pd(iq1,jq3);
1585 qq21 = _mm_mul_pd(iq2,jq1);
1586 qq22 = _mm_mul_pd(iq2,jq2);
1587 qq23 = _mm_mul_pd(iq2,jq3);
1588 qq31 = _mm_mul_pd(iq3,jq1);
1589 qq32 = _mm_mul_pd(iq3,jq2);
1590 qq33 = _mm_mul_pd(iq3,jq3);
1592 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1593 rcutoff_scalar = fr->rcoulomb;
1594 rcutoff = _mm_set1_pd(rcutoff_scalar);
1595 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1597 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
1598 rvdw = _mm_set1_pd(fr->rvdw);
1600 /* Avoid stupid compiler warnings */
1602 j_coord_offsetA = 0;
1603 j_coord_offsetB = 0;
1608 /* Start outer loop over neighborlists */
1609 for(iidx=0; iidx<nri; iidx++)
1611 /* Load shift vector for this list */
1612 i_shift_offset = DIM*shiftidx[iidx];
1614 /* Load limits for loop over neighbors */
1615 j_index_start = jindex[iidx];
1616 j_index_end = jindex[iidx+1];
1618 /* Get outer coordinate index */
1620 i_coord_offset = DIM*inr;
1622 /* Load i particle coords and add shift vector */
1623 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1624 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1626 fix0 = _mm_setzero_pd();
1627 fiy0 = _mm_setzero_pd();
1628 fiz0 = _mm_setzero_pd();
1629 fix1 = _mm_setzero_pd();
1630 fiy1 = _mm_setzero_pd();
1631 fiz1 = _mm_setzero_pd();
1632 fix2 = _mm_setzero_pd();
1633 fiy2 = _mm_setzero_pd();
1634 fiz2 = _mm_setzero_pd();
1635 fix3 = _mm_setzero_pd();
1636 fiy3 = _mm_setzero_pd();
1637 fiz3 = _mm_setzero_pd();
1639 /* Start inner kernel loop */
1640 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1643 /* Get j neighbor index, and coordinate index */
1645 jnrB = jjnr[jidx+1];
1646 j_coord_offsetA = DIM*jnrA;
1647 j_coord_offsetB = DIM*jnrB;
1649 /* load j atom coordinates */
1650 gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1651 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1652 &jy2,&jz2,&jx3,&jy3,&jz3);
1654 /* Calculate displacement vector */
1655 dx00 = _mm_sub_pd(ix0,jx0);
1656 dy00 = _mm_sub_pd(iy0,jy0);
1657 dz00 = _mm_sub_pd(iz0,jz0);
1658 dx11 = _mm_sub_pd(ix1,jx1);
1659 dy11 = _mm_sub_pd(iy1,jy1);
1660 dz11 = _mm_sub_pd(iz1,jz1);
1661 dx12 = _mm_sub_pd(ix1,jx2);
1662 dy12 = _mm_sub_pd(iy1,jy2);
1663 dz12 = _mm_sub_pd(iz1,jz2);
1664 dx13 = _mm_sub_pd(ix1,jx3);
1665 dy13 = _mm_sub_pd(iy1,jy3);
1666 dz13 = _mm_sub_pd(iz1,jz3);
1667 dx21 = _mm_sub_pd(ix2,jx1);
1668 dy21 = _mm_sub_pd(iy2,jy1);
1669 dz21 = _mm_sub_pd(iz2,jz1);
1670 dx22 = _mm_sub_pd(ix2,jx2);
1671 dy22 = _mm_sub_pd(iy2,jy2);
1672 dz22 = _mm_sub_pd(iz2,jz2);
1673 dx23 = _mm_sub_pd(ix2,jx3);
1674 dy23 = _mm_sub_pd(iy2,jy3);
1675 dz23 = _mm_sub_pd(iz2,jz3);
1676 dx31 = _mm_sub_pd(ix3,jx1);
1677 dy31 = _mm_sub_pd(iy3,jy1);
1678 dz31 = _mm_sub_pd(iz3,jz1);
1679 dx32 = _mm_sub_pd(ix3,jx2);
1680 dy32 = _mm_sub_pd(iy3,jy2);
1681 dz32 = _mm_sub_pd(iz3,jz2);
1682 dx33 = _mm_sub_pd(ix3,jx3);
1683 dy33 = _mm_sub_pd(iy3,jy3);
1684 dz33 = _mm_sub_pd(iz3,jz3);
1686 /* Calculate squared distance and things based on it */
1687 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1688 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1689 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1690 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1691 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1692 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1693 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1694 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1695 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1696 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1698 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1699 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1700 rinv13 = gmx_mm_invsqrt_pd(rsq13);
1701 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1702 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1703 rinv23 = gmx_mm_invsqrt_pd(rsq23);
1704 rinv31 = gmx_mm_invsqrt_pd(rsq31);
1705 rinv32 = gmx_mm_invsqrt_pd(rsq32);
1706 rinv33 = gmx_mm_invsqrt_pd(rsq33);
1708 rinvsq00 = gmx_mm_inv_pd(rsq00);
1709 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1710 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1711 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
1712 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1713 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1714 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
1715 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
1716 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
1717 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
1719 fjx0 = _mm_setzero_pd();
1720 fjy0 = _mm_setzero_pd();
1721 fjz0 = _mm_setzero_pd();
1722 fjx1 = _mm_setzero_pd();
1723 fjy1 = _mm_setzero_pd();
1724 fjz1 = _mm_setzero_pd();
1725 fjx2 = _mm_setzero_pd();
1726 fjy2 = _mm_setzero_pd();
1727 fjz2 = _mm_setzero_pd();
1728 fjx3 = _mm_setzero_pd();
1729 fjy3 = _mm_setzero_pd();
1730 fjz3 = _mm_setzero_pd();
1732 /**************************
1733 * CALCULATE INTERACTIONS *
1734 **************************/
1736 if (gmx_mm_any_lt(rsq00,rcutoff2))
1739 /* LENNARD-JONES DISPERSION/REPULSION */
1741 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1742 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1744 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1748 fscal = _mm_and_pd(fscal,cutoff_mask);
1750 /* Update vectorial force */
1751 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1752 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1753 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1755 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1756 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1757 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1761 /**************************
1762 * CALCULATE INTERACTIONS *
1763 **************************/
1765 if (gmx_mm_any_lt(rsq11,rcutoff2))
1768 r11 = _mm_mul_pd(rsq11,rinv11);
1770 /* EWALD ELECTROSTATICS */
1772 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1773 ewrt = _mm_mul_pd(r11,ewtabscale);
1774 ewitab = _mm_cvttpd_epi32(ewrt);
1776 eweps = _mm_frcz_pd(ewrt);
1778 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1780 twoeweps = _mm_add_pd(eweps,eweps);
1781 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1783 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1784 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1786 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1790 fscal = _mm_and_pd(fscal,cutoff_mask);
1792 /* Update vectorial force */
1793 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1794 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1795 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1797 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1798 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1799 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 if (gmx_mm_any_lt(rsq12,rcutoff2))
1810 r12 = _mm_mul_pd(rsq12,rinv12);
1812 /* EWALD ELECTROSTATICS */
1814 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1815 ewrt = _mm_mul_pd(r12,ewtabscale);
1816 ewitab = _mm_cvttpd_epi32(ewrt);
1818 eweps = _mm_frcz_pd(ewrt);
1820 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1822 twoeweps = _mm_add_pd(eweps,eweps);
1823 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1825 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1826 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1828 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1832 fscal = _mm_and_pd(fscal,cutoff_mask);
1834 /* Update vectorial force */
1835 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1836 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1837 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1839 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1840 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1841 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1845 /**************************
1846 * CALCULATE INTERACTIONS *
1847 **************************/
1849 if (gmx_mm_any_lt(rsq13,rcutoff2))
1852 r13 = _mm_mul_pd(rsq13,rinv13);
1854 /* EWALD ELECTROSTATICS */
1856 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1857 ewrt = _mm_mul_pd(r13,ewtabscale);
1858 ewitab = _mm_cvttpd_epi32(ewrt);
1860 eweps = _mm_frcz_pd(ewrt);
1862 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1864 twoeweps = _mm_add_pd(eweps,eweps);
1865 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1867 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1868 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
1870 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
1874 fscal = _mm_and_pd(fscal,cutoff_mask);
1876 /* Update vectorial force */
1877 fix1 = _mm_macc_pd(dx13,fscal,fix1);
1878 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
1879 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
1881 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
1882 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
1883 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
1887 /**************************
1888 * CALCULATE INTERACTIONS *
1889 **************************/
1891 if (gmx_mm_any_lt(rsq21,rcutoff2))
1894 r21 = _mm_mul_pd(rsq21,rinv21);
1896 /* EWALD ELECTROSTATICS */
1898 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1899 ewrt = _mm_mul_pd(r21,ewtabscale);
1900 ewitab = _mm_cvttpd_epi32(ewrt);
1902 eweps = _mm_frcz_pd(ewrt);
1904 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1906 twoeweps = _mm_add_pd(eweps,eweps);
1907 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1909 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1910 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1912 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1916 fscal = _mm_and_pd(fscal,cutoff_mask);
1918 /* Update vectorial force */
1919 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1920 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1921 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1923 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1924 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1925 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1929 /**************************
1930 * CALCULATE INTERACTIONS *
1931 **************************/
1933 if (gmx_mm_any_lt(rsq22,rcutoff2))
1936 r22 = _mm_mul_pd(rsq22,rinv22);
1938 /* EWALD ELECTROSTATICS */
1940 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1941 ewrt = _mm_mul_pd(r22,ewtabscale);
1942 ewitab = _mm_cvttpd_epi32(ewrt);
1944 eweps = _mm_frcz_pd(ewrt);
1946 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1948 twoeweps = _mm_add_pd(eweps,eweps);
1949 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1951 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1952 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1954 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1958 fscal = _mm_and_pd(fscal,cutoff_mask);
1960 /* Update vectorial force */
1961 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1962 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1963 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1965 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1966 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1967 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1971 /**************************
1972 * CALCULATE INTERACTIONS *
1973 **************************/
1975 if (gmx_mm_any_lt(rsq23,rcutoff2))
1978 r23 = _mm_mul_pd(rsq23,rinv23);
1980 /* EWALD ELECTROSTATICS */
1982 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1983 ewrt = _mm_mul_pd(r23,ewtabscale);
1984 ewitab = _mm_cvttpd_epi32(ewrt);
1986 eweps = _mm_frcz_pd(ewrt);
1988 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1990 twoeweps = _mm_add_pd(eweps,eweps);
1991 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1993 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1994 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
1996 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2000 fscal = _mm_and_pd(fscal,cutoff_mask);
2002 /* Update vectorial force */
2003 fix2 = _mm_macc_pd(dx23,fscal,fix2);
2004 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2005 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2007 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2008 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2009 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2013 /**************************
2014 * CALCULATE INTERACTIONS *
2015 **************************/
2017 if (gmx_mm_any_lt(rsq31,rcutoff2))
2020 r31 = _mm_mul_pd(rsq31,rinv31);
2022 /* EWALD ELECTROSTATICS */
2024 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2025 ewrt = _mm_mul_pd(r31,ewtabscale);
2026 ewitab = _mm_cvttpd_epi32(ewrt);
2028 eweps = _mm_frcz_pd(ewrt);
2030 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2032 twoeweps = _mm_add_pd(eweps,eweps);
2033 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2035 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2036 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2038 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2042 fscal = _mm_and_pd(fscal,cutoff_mask);
2044 /* Update vectorial force */
2045 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2046 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2047 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2049 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2050 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2051 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2055 /**************************
2056 * CALCULATE INTERACTIONS *
2057 **************************/
2059 if (gmx_mm_any_lt(rsq32,rcutoff2))
2062 r32 = _mm_mul_pd(rsq32,rinv32);
2064 /* EWALD ELECTROSTATICS */
2066 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2067 ewrt = _mm_mul_pd(r32,ewtabscale);
2068 ewitab = _mm_cvttpd_epi32(ewrt);
2070 eweps = _mm_frcz_pd(ewrt);
2072 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2074 twoeweps = _mm_add_pd(eweps,eweps);
2075 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2077 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2078 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2080 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2084 fscal = _mm_and_pd(fscal,cutoff_mask);
2086 /* Update vectorial force */
2087 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2088 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2089 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2091 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2092 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2093 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2097 /**************************
2098 * CALCULATE INTERACTIONS *
2099 **************************/
2101 if (gmx_mm_any_lt(rsq33,rcutoff2))
2104 r33 = _mm_mul_pd(rsq33,rinv33);
2106 /* EWALD ELECTROSTATICS */
2108 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2109 ewrt = _mm_mul_pd(r33,ewtabscale);
2110 ewitab = _mm_cvttpd_epi32(ewrt);
2112 eweps = _mm_frcz_pd(ewrt);
2114 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2116 twoeweps = _mm_add_pd(eweps,eweps);
2117 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
2119 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2120 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2122 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2126 fscal = _mm_and_pd(fscal,cutoff_mask);
2128 /* Update vectorial force */
2129 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2130 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2131 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2133 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2134 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2135 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2139 gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2141 /* Inner loop uses 414 flops */
2144 if(jidx<j_index_end)
2148 j_coord_offsetA = DIM*jnrA;
2150 /* load j atom coordinates */
2151 gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2152 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2153 &jy2,&jz2,&jx3,&jy3,&jz3);
2155 /* Calculate displacement vector */
2156 dx00 = _mm_sub_pd(ix0,jx0);
2157 dy00 = _mm_sub_pd(iy0,jy0);
2158 dz00 = _mm_sub_pd(iz0,jz0);
2159 dx11 = _mm_sub_pd(ix1,jx1);
2160 dy11 = _mm_sub_pd(iy1,jy1);
2161 dz11 = _mm_sub_pd(iz1,jz1);
2162 dx12 = _mm_sub_pd(ix1,jx2);
2163 dy12 = _mm_sub_pd(iy1,jy2);
2164 dz12 = _mm_sub_pd(iz1,jz2);
2165 dx13 = _mm_sub_pd(ix1,jx3);
2166 dy13 = _mm_sub_pd(iy1,jy3);
2167 dz13 = _mm_sub_pd(iz1,jz3);
2168 dx21 = _mm_sub_pd(ix2,jx1);
2169 dy21 = _mm_sub_pd(iy2,jy1);
2170 dz21 = _mm_sub_pd(iz2,jz1);
2171 dx22 = _mm_sub_pd(ix2,jx2);
2172 dy22 = _mm_sub_pd(iy2,jy2);
2173 dz22 = _mm_sub_pd(iz2,jz2);
2174 dx23 = _mm_sub_pd(ix2,jx3);
2175 dy23 = _mm_sub_pd(iy2,jy3);
2176 dz23 = _mm_sub_pd(iz2,jz3);
2177 dx31 = _mm_sub_pd(ix3,jx1);
2178 dy31 = _mm_sub_pd(iy3,jy1);
2179 dz31 = _mm_sub_pd(iz3,jz1);
2180 dx32 = _mm_sub_pd(ix3,jx2);
2181 dy32 = _mm_sub_pd(iy3,jy2);
2182 dz32 = _mm_sub_pd(iz3,jz2);
2183 dx33 = _mm_sub_pd(ix3,jx3);
2184 dy33 = _mm_sub_pd(iy3,jy3);
2185 dz33 = _mm_sub_pd(iz3,jz3);
2187 /* Calculate squared distance and things based on it */
2188 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2189 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2190 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2191 rsq13 = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2192 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2193 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2194 rsq23 = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2195 rsq31 = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2196 rsq32 = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2197 rsq33 = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2199 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2200 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2201 rinv13 = gmx_mm_invsqrt_pd(rsq13);
2202 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2203 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2204 rinv23 = gmx_mm_invsqrt_pd(rsq23);
2205 rinv31 = gmx_mm_invsqrt_pd(rsq31);
2206 rinv32 = gmx_mm_invsqrt_pd(rsq32);
2207 rinv33 = gmx_mm_invsqrt_pd(rsq33);
2209 rinvsq00 = gmx_mm_inv_pd(rsq00);
2210 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2211 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2212 rinvsq13 = _mm_mul_pd(rinv13,rinv13);
2213 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2214 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2215 rinvsq23 = _mm_mul_pd(rinv23,rinv23);
2216 rinvsq31 = _mm_mul_pd(rinv31,rinv31);
2217 rinvsq32 = _mm_mul_pd(rinv32,rinv32);
2218 rinvsq33 = _mm_mul_pd(rinv33,rinv33);
2220 fjx0 = _mm_setzero_pd();
2221 fjy0 = _mm_setzero_pd();
2222 fjz0 = _mm_setzero_pd();
2223 fjx1 = _mm_setzero_pd();
2224 fjy1 = _mm_setzero_pd();
2225 fjz1 = _mm_setzero_pd();
2226 fjx2 = _mm_setzero_pd();
2227 fjy2 = _mm_setzero_pd();
2228 fjz2 = _mm_setzero_pd();
2229 fjx3 = _mm_setzero_pd();
2230 fjy3 = _mm_setzero_pd();
2231 fjz3 = _mm_setzero_pd();
2233 /**************************
2234 * CALCULATE INTERACTIONS *
2235 **************************/
2237 if (gmx_mm_any_lt(rsq00,rcutoff2))
2240 /* LENNARD-JONES DISPERSION/REPULSION */
2242 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2243 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2245 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2249 fscal = _mm_and_pd(fscal,cutoff_mask);
2251 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2253 /* Update vectorial force */
2254 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2255 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2256 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2258 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2259 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2260 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2264 /**************************
2265 * CALCULATE INTERACTIONS *
2266 **************************/
2268 if (gmx_mm_any_lt(rsq11,rcutoff2))
2271 r11 = _mm_mul_pd(rsq11,rinv11);
2273 /* EWALD ELECTROSTATICS */
2275 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2276 ewrt = _mm_mul_pd(r11,ewtabscale);
2277 ewitab = _mm_cvttpd_epi32(ewrt);
2279 eweps = _mm_frcz_pd(ewrt);
2281 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2283 twoeweps = _mm_add_pd(eweps,eweps);
2284 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2285 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2286 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2288 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2292 fscal = _mm_and_pd(fscal,cutoff_mask);
2294 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2296 /* Update vectorial force */
2297 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2298 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2299 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2301 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2302 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2303 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2307 /**************************
2308 * CALCULATE INTERACTIONS *
2309 **************************/
2311 if (gmx_mm_any_lt(rsq12,rcutoff2))
2314 r12 = _mm_mul_pd(rsq12,rinv12);
2316 /* EWALD ELECTROSTATICS */
2318 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2319 ewrt = _mm_mul_pd(r12,ewtabscale);
2320 ewitab = _mm_cvttpd_epi32(ewrt);
2322 eweps = _mm_frcz_pd(ewrt);
2324 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2326 twoeweps = _mm_add_pd(eweps,eweps);
2327 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2328 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2329 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2331 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2335 fscal = _mm_and_pd(fscal,cutoff_mask);
2337 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2339 /* Update vectorial force */
2340 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2341 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2342 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2344 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2345 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2346 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2350 /**************************
2351 * CALCULATE INTERACTIONS *
2352 **************************/
2354 if (gmx_mm_any_lt(rsq13,rcutoff2))
2357 r13 = _mm_mul_pd(rsq13,rinv13);
2359 /* EWALD ELECTROSTATICS */
2361 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2362 ewrt = _mm_mul_pd(r13,ewtabscale);
2363 ewitab = _mm_cvttpd_epi32(ewrt);
2365 eweps = _mm_frcz_pd(ewrt);
2367 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2369 twoeweps = _mm_add_pd(eweps,eweps);
2370 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2371 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2372 felec = _mm_mul_pd(_mm_mul_pd(qq13,rinv13),_mm_sub_pd(rinvsq13,felec));
2374 cutoff_mask = _mm_cmplt_pd(rsq13,rcutoff2);
2378 fscal = _mm_and_pd(fscal,cutoff_mask);
2380 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2382 /* Update vectorial force */
2383 fix1 = _mm_macc_pd(dx13,fscal,fix1);
2384 fiy1 = _mm_macc_pd(dy13,fscal,fiy1);
2385 fiz1 = _mm_macc_pd(dz13,fscal,fiz1);
2387 fjx3 = _mm_macc_pd(dx13,fscal,fjx3);
2388 fjy3 = _mm_macc_pd(dy13,fscal,fjy3);
2389 fjz3 = _mm_macc_pd(dz13,fscal,fjz3);
2393 /**************************
2394 * CALCULATE INTERACTIONS *
2395 **************************/
2397 if (gmx_mm_any_lt(rsq21,rcutoff2))
2400 r21 = _mm_mul_pd(rsq21,rinv21);
2402 /* EWALD ELECTROSTATICS */
2404 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2405 ewrt = _mm_mul_pd(r21,ewtabscale);
2406 ewitab = _mm_cvttpd_epi32(ewrt);
2408 eweps = _mm_frcz_pd(ewrt);
2410 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2412 twoeweps = _mm_add_pd(eweps,eweps);
2413 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2414 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2415 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2417 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2421 fscal = _mm_and_pd(fscal,cutoff_mask);
2423 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2425 /* Update vectorial force */
2426 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2427 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2428 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2430 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2431 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2432 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2436 /**************************
2437 * CALCULATE INTERACTIONS *
2438 **************************/
2440 if (gmx_mm_any_lt(rsq22,rcutoff2))
2443 r22 = _mm_mul_pd(rsq22,rinv22);
2445 /* EWALD ELECTROSTATICS */
2447 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2448 ewrt = _mm_mul_pd(r22,ewtabscale);
2449 ewitab = _mm_cvttpd_epi32(ewrt);
2451 eweps = _mm_frcz_pd(ewrt);
2453 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2455 twoeweps = _mm_add_pd(eweps,eweps);
2456 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2457 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2458 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2460 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2464 fscal = _mm_and_pd(fscal,cutoff_mask);
2466 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2468 /* Update vectorial force */
2469 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2470 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2471 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2473 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2474 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2475 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2479 /**************************
2480 * CALCULATE INTERACTIONS *
2481 **************************/
2483 if (gmx_mm_any_lt(rsq23,rcutoff2))
2486 r23 = _mm_mul_pd(rsq23,rinv23);
2488 /* EWALD ELECTROSTATICS */
2490 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2491 ewrt = _mm_mul_pd(r23,ewtabscale);
2492 ewitab = _mm_cvttpd_epi32(ewrt);
2494 eweps = _mm_frcz_pd(ewrt);
2496 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2498 twoeweps = _mm_add_pd(eweps,eweps);
2499 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2500 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2501 felec = _mm_mul_pd(_mm_mul_pd(qq23,rinv23),_mm_sub_pd(rinvsq23,felec));
2503 cutoff_mask = _mm_cmplt_pd(rsq23,rcutoff2);
2507 fscal = _mm_and_pd(fscal,cutoff_mask);
2509 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2511 /* Update vectorial force */
2512 fix2 = _mm_macc_pd(dx23,fscal,fix2);
2513 fiy2 = _mm_macc_pd(dy23,fscal,fiy2);
2514 fiz2 = _mm_macc_pd(dz23,fscal,fiz2);
2516 fjx3 = _mm_macc_pd(dx23,fscal,fjx3);
2517 fjy3 = _mm_macc_pd(dy23,fscal,fjy3);
2518 fjz3 = _mm_macc_pd(dz23,fscal,fjz3);
2522 /**************************
2523 * CALCULATE INTERACTIONS *
2524 **************************/
2526 if (gmx_mm_any_lt(rsq31,rcutoff2))
2529 r31 = _mm_mul_pd(rsq31,rinv31);
2531 /* EWALD ELECTROSTATICS */
2533 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2534 ewrt = _mm_mul_pd(r31,ewtabscale);
2535 ewitab = _mm_cvttpd_epi32(ewrt);
2537 eweps = _mm_frcz_pd(ewrt);
2539 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2541 twoeweps = _mm_add_pd(eweps,eweps);
2542 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2543 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2544 felec = _mm_mul_pd(_mm_mul_pd(qq31,rinv31),_mm_sub_pd(rinvsq31,felec));
2546 cutoff_mask = _mm_cmplt_pd(rsq31,rcutoff2);
2550 fscal = _mm_and_pd(fscal,cutoff_mask);
2552 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2554 /* Update vectorial force */
2555 fix3 = _mm_macc_pd(dx31,fscal,fix3);
2556 fiy3 = _mm_macc_pd(dy31,fscal,fiy3);
2557 fiz3 = _mm_macc_pd(dz31,fscal,fiz3);
2559 fjx1 = _mm_macc_pd(dx31,fscal,fjx1);
2560 fjy1 = _mm_macc_pd(dy31,fscal,fjy1);
2561 fjz1 = _mm_macc_pd(dz31,fscal,fjz1);
2565 /**************************
2566 * CALCULATE INTERACTIONS *
2567 **************************/
2569 if (gmx_mm_any_lt(rsq32,rcutoff2))
2572 r32 = _mm_mul_pd(rsq32,rinv32);
2574 /* EWALD ELECTROSTATICS */
2576 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2577 ewrt = _mm_mul_pd(r32,ewtabscale);
2578 ewitab = _mm_cvttpd_epi32(ewrt);
2580 eweps = _mm_frcz_pd(ewrt);
2582 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2584 twoeweps = _mm_add_pd(eweps,eweps);
2585 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2586 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2587 felec = _mm_mul_pd(_mm_mul_pd(qq32,rinv32),_mm_sub_pd(rinvsq32,felec));
2589 cutoff_mask = _mm_cmplt_pd(rsq32,rcutoff2);
2593 fscal = _mm_and_pd(fscal,cutoff_mask);
2595 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2597 /* Update vectorial force */
2598 fix3 = _mm_macc_pd(dx32,fscal,fix3);
2599 fiy3 = _mm_macc_pd(dy32,fscal,fiy3);
2600 fiz3 = _mm_macc_pd(dz32,fscal,fiz3);
2602 fjx2 = _mm_macc_pd(dx32,fscal,fjx2);
2603 fjy2 = _mm_macc_pd(dy32,fscal,fjy2);
2604 fjz2 = _mm_macc_pd(dz32,fscal,fjz2);
2608 /**************************
2609 * CALCULATE INTERACTIONS *
2610 **************************/
2612 if (gmx_mm_any_lt(rsq33,rcutoff2))
2615 r33 = _mm_mul_pd(rsq33,rinv33);
2617 /* EWALD ELECTROSTATICS */
2619 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2620 ewrt = _mm_mul_pd(r33,ewtabscale);
2621 ewitab = _mm_cvttpd_epi32(ewrt);
2623 eweps = _mm_frcz_pd(ewrt);
2625 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2627 twoeweps = _mm_add_pd(eweps,eweps);
2628 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2629 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2630 felec = _mm_mul_pd(_mm_mul_pd(qq33,rinv33),_mm_sub_pd(rinvsq33,felec));
2632 cutoff_mask = _mm_cmplt_pd(rsq33,rcutoff2);
2636 fscal = _mm_and_pd(fscal,cutoff_mask);
2638 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2640 /* Update vectorial force */
2641 fix3 = _mm_macc_pd(dx33,fscal,fix3);
2642 fiy3 = _mm_macc_pd(dy33,fscal,fiy3);
2643 fiz3 = _mm_macc_pd(dz33,fscal,fiz3);
2645 fjx3 = _mm_macc_pd(dx33,fscal,fjx3);
2646 fjy3 = _mm_macc_pd(dy33,fscal,fjy3);
2647 fjz3 = _mm_macc_pd(dz33,fscal,fjz3);
2651 gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2653 /* Inner loop uses 414 flops */
2656 /* End of innermost loop */
2658 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2659 f+i_coord_offset,fshift+i_shift_offset);
2661 /* Increment number of inner iterations */
2662 inneriter += j_index_end - j_index_start;
2664 /* Outer loop uses 24 flops */
2667 /* Increment number of outer iterations */
2670 /* Update outer/inner flops */
2672 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*414);