2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m128d dummy_mask,cutoff_mask;
99 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
100 __m128d one = _mm_set1_pd(1.0);
101 __m128d two = _mm_set1_pd(2.0);
107 jindex = nlist->jindex;
109 shiftidx = nlist->shift;
111 shiftvec = fr->shift_vec[0];
112 fshift = fr->fshift[0];
113 facel = _mm_set1_pd(fr->epsfac);
114 charge = mdatoms->chargeA;
115 nvdwtype = fr->ntype;
117 vdwtype = mdatoms->typeA;
119 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
120 ewtab = fr->ic->tabq_coul_FDV0;
121 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
122 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
124 /* Setup water-specific parameters */
125 inr = nlist->iinr[0];
126 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
127 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
128 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
129 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
131 jq0 = _mm_set1_pd(charge[inr+0]);
132 jq1 = _mm_set1_pd(charge[inr+1]);
133 jq2 = _mm_set1_pd(charge[inr+2]);
134 vdwjidx0A = 2*vdwtype[inr+0];
135 qq00 = _mm_mul_pd(iq0,jq0);
136 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
137 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
138 qq01 = _mm_mul_pd(iq0,jq1);
139 qq02 = _mm_mul_pd(iq0,jq2);
140 qq10 = _mm_mul_pd(iq1,jq0);
141 qq11 = _mm_mul_pd(iq1,jq1);
142 qq12 = _mm_mul_pd(iq1,jq2);
143 qq20 = _mm_mul_pd(iq2,jq0);
144 qq21 = _mm_mul_pd(iq2,jq1);
145 qq22 = _mm_mul_pd(iq2,jq2);
147 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148 rcutoff_scalar = fr->rcoulomb;
149 rcutoff = _mm_set1_pd(rcutoff_scalar);
150 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
152 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
153 rvdw = _mm_set1_pd(fr->rvdw);
155 /* Avoid stupid compiler warnings */
163 /* Start outer loop over neighborlists */
164 for(iidx=0; iidx<nri; iidx++)
166 /* Load shift vector for this list */
167 i_shift_offset = DIM*shiftidx[iidx];
169 /* Load limits for loop over neighbors */
170 j_index_start = jindex[iidx];
171 j_index_end = jindex[iidx+1];
173 /* Get outer coordinate index */
175 i_coord_offset = DIM*inr;
177 /* Load i particle coords and add shift vector */
178 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
179 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
181 fix0 = _mm_setzero_pd();
182 fiy0 = _mm_setzero_pd();
183 fiz0 = _mm_setzero_pd();
184 fix1 = _mm_setzero_pd();
185 fiy1 = _mm_setzero_pd();
186 fiz1 = _mm_setzero_pd();
187 fix2 = _mm_setzero_pd();
188 fiy2 = _mm_setzero_pd();
189 fiz2 = _mm_setzero_pd();
191 /* Reset potential sums */
192 velecsum = _mm_setzero_pd();
193 vvdwsum = _mm_setzero_pd();
195 /* Start inner kernel loop */
196 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
199 /* Get j neighbor index, and coordinate index */
202 j_coord_offsetA = DIM*jnrA;
203 j_coord_offsetB = DIM*jnrB;
205 /* load j atom coordinates */
206 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
209 /* Calculate displacement vector */
210 dx00 = _mm_sub_pd(ix0,jx0);
211 dy00 = _mm_sub_pd(iy0,jy0);
212 dz00 = _mm_sub_pd(iz0,jz0);
213 dx01 = _mm_sub_pd(ix0,jx1);
214 dy01 = _mm_sub_pd(iy0,jy1);
215 dz01 = _mm_sub_pd(iz0,jz1);
216 dx02 = _mm_sub_pd(ix0,jx2);
217 dy02 = _mm_sub_pd(iy0,jy2);
218 dz02 = _mm_sub_pd(iz0,jz2);
219 dx10 = _mm_sub_pd(ix1,jx0);
220 dy10 = _mm_sub_pd(iy1,jy0);
221 dz10 = _mm_sub_pd(iz1,jz0);
222 dx11 = _mm_sub_pd(ix1,jx1);
223 dy11 = _mm_sub_pd(iy1,jy1);
224 dz11 = _mm_sub_pd(iz1,jz1);
225 dx12 = _mm_sub_pd(ix1,jx2);
226 dy12 = _mm_sub_pd(iy1,jy2);
227 dz12 = _mm_sub_pd(iz1,jz2);
228 dx20 = _mm_sub_pd(ix2,jx0);
229 dy20 = _mm_sub_pd(iy2,jy0);
230 dz20 = _mm_sub_pd(iz2,jz0);
231 dx21 = _mm_sub_pd(ix2,jx1);
232 dy21 = _mm_sub_pd(iy2,jy1);
233 dz21 = _mm_sub_pd(iz2,jz1);
234 dx22 = _mm_sub_pd(ix2,jx2);
235 dy22 = _mm_sub_pd(iy2,jy2);
236 dz22 = _mm_sub_pd(iz2,jz2);
238 /* Calculate squared distance and things based on it */
239 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
240 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
241 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
242 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
243 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
246 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
249 rinv00 = gmx_mm_invsqrt_pd(rsq00);
250 rinv01 = gmx_mm_invsqrt_pd(rsq01);
251 rinv02 = gmx_mm_invsqrt_pd(rsq02);
252 rinv10 = gmx_mm_invsqrt_pd(rsq10);
253 rinv11 = gmx_mm_invsqrt_pd(rsq11);
254 rinv12 = gmx_mm_invsqrt_pd(rsq12);
255 rinv20 = gmx_mm_invsqrt_pd(rsq20);
256 rinv21 = gmx_mm_invsqrt_pd(rsq21);
257 rinv22 = gmx_mm_invsqrt_pd(rsq22);
259 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
260 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
261 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
262 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
263 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
264 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
265 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
266 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
267 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
269 fjx0 = _mm_setzero_pd();
270 fjy0 = _mm_setzero_pd();
271 fjz0 = _mm_setzero_pd();
272 fjx1 = _mm_setzero_pd();
273 fjy1 = _mm_setzero_pd();
274 fjz1 = _mm_setzero_pd();
275 fjx2 = _mm_setzero_pd();
276 fjy2 = _mm_setzero_pd();
277 fjz2 = _mm_setzero_pd();
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 if (gmx_mm_any_lt(rsq00,rcutoff2))
286 r00 = _mm_mul_pd(rsq00,rinv00);
288 /* EWALD ELECTROSTATICS */
290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
291 ewrt = _mm_mul_pd(r00,ewtabscale);
292 ewitab = _mm_cvttpd_epi32(ewrt);
294 eweps = _mm_frcz_pd(ewrt);
296 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
298 twoeweps = _mm_add_pd(eweps,eweps);
299 ewitab = _mm_slli_epi32(ewitab,2);
300 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
301 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
302 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
303 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
304 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
305 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
306 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
307 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
308 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
309 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
311 /* LENNARD-JONES DISPERSION/REPULSION */
313 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
314 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
315 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
316 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
317 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
318 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
320 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
322 /* Update potential sum for this i atom from the interaction with this j atom. */
323 velec = _mm_and_pd(velec,cutoff_mask);
324 velecsum = _mm_add_pd(velecsum,velec);
325 vvdw = _mm_and_pd(vvdw,cutoff_mask);
326 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
328 fscal = _mm_add_pd(felec,fvdw);
330 fscal = _mm_and_pd(fscal,cutoff_mask);
332 /* Update vectorial force */
333 fix0 = _mm_macc_pd(dx00,fscal,fix0);
334 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
335 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
337 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
338 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
339 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 if (gmx_mm_any_lt(rsq01,rcutoff2))
350 r01 = _mm_mul_pd(rsq01,rinv01);
352 /* EWALD ELECTROSTATICS */
354 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
355 ewrt = _mm_mul_pd(r01,ewtabscale);
356 ewitab = _mm_cvttpd_epi32(ewrt);
358 eweps = _mm_frcz_pd(ewrt);
360 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
362 twoeweps = _mm_add_pd(eweps,eweps);
363 ewitab = _mm_slli_epi32(ewitab,2);
364 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
365 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
366 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
367 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
368 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
369 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
370 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
371 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
372 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
373 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
375 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velec = _mm_and_pd(velec,cutoff_mask);
379 velecsum = _mm_add_pd(velecsum,velec);
383 fscal = _mm_and_pd(fscal,cutoff_mask);
385 /* Update vectorial force */
386 fix0 = _mm_macc_pd(dx01,fscal,fix0);
387 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
388 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
390 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
391 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
392 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 if (gmx_mm_any_lt(rsq02,rcutoff2))
403 r02 = _mm_mul_pd(rsq02,rinv02);
405 /* EWALD ELECTROSTATICS */
407 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408 ewrt = _mm_mul_pd(r02,ewtabscale);
409 ewitab = _mm_cvttpd_epi32(ewrt);
411 eweps = _mm_frcz_pd(ewrt);
413 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
415 twoeweps = _mm_add_pd(eweps,eweps);
416 ewitab = _mm_slli_epi32(ewitab,2);
417 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
418 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
419 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
420 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
421 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
422 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
423 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
424 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
425 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
426 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
428 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velec = _mm_and_pd(velec,cutoff_mask);
432 velecsum = _mm_add_pd(velecsum,velec);
436 fscal = _mm_and_pd(fscal,cutoff_mask);
438 /* Update vectorial force */
439 fix0 = _mm_macc_pd(dx02,fscal,fix0);
440 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
441 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
443 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
444 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
445 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 if (gmx_mm_any_lt(rsq10,rcutoff2))
456 r10 = _mm_mul_pd(rsq10,rinv10);
458 /* EWALD ELECTROSTATICS */
460 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
461 ewrt = _mm_mul_pd(r10,ewtabscale);
462 ewitab = _mm_cvttpd_epi32(ewrt);
464 eweps = _mm_frcz_pd(ewrt);
466 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
468 twoeweps = _mm_add_pd(eweps,eweps);
469 ewitab = _mm_slli_epi32(ewitab,2);
470 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
471 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
472 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
473 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
474 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
475 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
476 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
477 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
478 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
479 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
481 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velec = _mm_and_pd(velec,cutoff_mask);
485 velecsum = _mm_add_pd(velecsum,velec);
489 fscal = _mm_and_pd(fscal,cutoff_mask);
491 /* Update vectorial force */
492 fix1 = _mm_macc_pd(dx10,fscal,fix1);
493 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
494 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
496 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
497 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
498 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 if (gmx_mm_any_lt(rsq11,rcutoff2))
509 r11 = _mm_mul_pd(rsq11,rinv11);
511 /* EWALD ELECTROSTATICS */
513 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514 ewrt = _mm_mul_pd(r11,ewtabscale);
515 ewitab = _mm_cvttpd_epi32(ewrt);
517 eweps = _mm_frcz_pd(ewrt);
519 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
521 twoeweps = _mm_add_pd(eweps,eweps);
522 ewitab = _mm_slli_epi32(ewitab,2);
523 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
524 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
525 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
526 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
527 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
528 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
529 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
530 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
531 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
532 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
534 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velec = _mm_and_pd(velec,cutoff_mask);
538 velecsum = _mm_add_pd(velecsum,velec);
542 fscal = _mm_and_pd(fscal,cutoff_mask);
544 /* Update vectorial force */
545 fix1 = _mm_macc_pd(dx11,fscal,fix1);
546 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
547 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
549 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
550 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
551 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 if (gmx_mm_any_lt(rsq12,rcutoff2))
562 r12 = _mm_mul_pd(rsq12,rinv12);
564 /* EWALD ELECTROSTATICS */
566 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
567 ewrt = _mm_mul_pd(r12,ewtabscale);
568 ewitab = _mm_cvttpd_epi32(ewrt);
570 eweps = _mm_frcz_pd(ewrt);
572 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
574 twoeweps = _mm_add_pd(eweps,eweps);
575 ewitab = _mm_slli_epi32(ewitab,2);
576 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
577 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
578 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
579 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
580 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
581 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
582 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
583 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
584 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
585 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
587 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velec = _mm_and_pd(velec,cutoff_mask);
591 velecsum = _mm_add_pd(velecsum,velec);
595 fscal = _mm_and_pd(fscal,cutoff_mask);
597 /* Update vectorial force */
598 fix1 = _mm_macc_pd(dx12,fscal,fix1);
599 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
600 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
602 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
603 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
604 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 if (gmx_mm_any_lt(rsq20,rcutoff2))
615 r20 = _mm_mul_pd(rsq20,rinv20);
617 /* EWALD ELECTROSTATICS */
619 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
620 ewrt = _mm_mul_pd(r20,ewtabscale);
621 ewitab = _mm_cvttpd_epi32(ewrt);
623 eweps = _mm_frcz_pd(ewrt);
625 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
627 twoeweps = _mm_add_pd(eweps,eweps);
628 ewitab = _mm_slli_epi32(ewitab,2);
629 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
630 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
631 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
632 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
633 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
634 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
635 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
636 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
637 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
638 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
640 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
642 /* Update potential sum for this i atom from the interaction with this j atom. */
643 velec = _mm_and_pd(velec,cutoff_mask);
644 velecsum = _mm_add_pd(velecsum,velec);
648 fscal = _mm_and_pd(fscal,cutoff_mask);
650 /* Update vectorial force */
651 fix2 = _mm_macc_pd(dx20,fscal,fix2);
652 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
653 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
655 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
656 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
657 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
661 /**************************
662 * CALCULATE INTERACTIONS *
663 **************************/
665 if (gmx_mm_any_lt(rsq21,rcutoff2))
668 r21 = _mm_mul_pd(rsq21,rinv21);
670 /* EWALD ELECTROSTATICS */
672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
673 ewrt = _mm_mul_pd(r21,ewtabscale);
674 ewitab = _mm_cvttpd_epi32(ewrt);
676 eweps = _mm_frcz_pd(ewrt);
678 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
680 twoeweps = _mm_add_pd(eweps,eweps);
681 ewitab = _mm_slli_epi32(ewitab,2);
682 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
683 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
684 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
685 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
686 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
687 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
688 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
689 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
690 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
691 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
693 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
695 /* Update potential sum for this i atom from the interaction with this j atom. */
696 velec = _mm_and_pd(velec,cutoff_mask);
697 velecsum = _mm_add_pd(velecsum,velec);
701 fscal = _mm_and_pd(fscal,cutoff_mask);
703 /* Update vectorial force */
704 fix2 = _mm_macc_pd(dx21,fscal,fix2);
705 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
706 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
708 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
709 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
710 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 if (gmx_mm_any_lt(rsq22,rcutoff2))
721 r22 = _mm_mul_pd(rsq22,rinv22);
723 /* EWALD ELECTROSTATICS */
725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
726 ewrt = _mm_mul_pd(r22,ewtabscale);
727 ewitab = _mm_cvttpd_epi32(ewrt);
729 eweps = _mm_frcz_pd(ewrt);
731 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
733 twoeweps = _mm_add_pd(eweps,eweps);
734 ewitab = _mm_slli_epi32(ewitab,2);
735 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
736 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
737 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
738 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
739 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
740 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
741 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
742 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
743 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
744 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
746 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
748 /* Update potential sum for this i atom from the interaction with this j atom. */
749 velec = _mm_and_pd(velec,cutoff_mask);
750 velecsum = _mm_add_pd(velecsum,velec);
754 fscal = _mm_and_pd(fscal,cutoff_mask);
756 /* Update vectorial force */
757 fix2 = _mm_macc_pd(dx22,fscal,fix2);
758 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
759 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
761 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
762 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
763 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
767 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
769 /* Inner loop uses 459 flops */
776 j_coord_offsetA = DIM*jnrA;
778 /* load j atom coordinates */
779 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
780 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
782 /* Calculate displacement vector */
783 dx00 = _mm_sub_pd(ix0,jx0);
784 dy00 = _mm_sub_pd(iy0,jy0);
785 dz00 = _mm_sub_pd(iz0,jz0);
786 dx01 = _mm_sub_pd(ix0,jx1);
787 dy01 = _mm_sub_pd(iy0,jy1);
788 dz01 = _mm_sub_pd(iz0,jz1);
789 dx02 = _mm_sub_pd(ix0,jx2);
790 dy02 = _mm_sub_pd(iy0,jy2);
791 dz02 = _mm_sub_pd(iz0,jz2);
792 dx10 = _mm_sub_pd(ix1,jx0);
793 dy10 = _mm_sub_pd(iy1,jy0);
794 dz10 = _mm_sub_pd(iz1,jz0);
795 dx11 = _mm_sub_pd(ix1,jx1);
796 dy11 = _mm_sub_pd(iy1,jy1);
797 dz11 = _mm_sub_pd(iz1,jz1);
798 dx12 = _mm_sub_pd(ix1,jx2);
799 dy12 = _mm_sub_pd(iy1,jy2);
800 dz12 = _mm_sub_pd(iz1,jz2);
801 dx20 = _mm_sub_pd(ix2,jx0);
802 dy20 = _mm_sub_pd(iy2,jy0);
803 dz20 = _mm_sub_pd(iz2,jz0);
804 dx21 = _mm_sub_pd(ix2,jx1);
805 dy21 = _mm_sub_pd(iy2,jy1);
806 dz21 = _mm_sub_pd(iz2,jz1);
807 dx22 = _mm_sub_pd(ix2,jx2);
808 dy22 = _mm_sub_pd(iy2,jy2);
809 dz22 = _mm_sub_pd(iz2,jz2);
811 /* Calculate squared distance and things based on it */
812 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
813 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
814 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
815 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
816 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
817 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
818 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
819 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
820 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
822 rinv00 = gmx_mm_invsqrt_pd(rsq00);
823 rinv01 = gmx_mm_invsqrt_pd(rsq01);
824 rinv02 = gmx_mm_invsqrt_pd(rsq02);
825 rinv10 = gmx_mm_invsqrt_pd(rsq10);
826 rinv11 = gmx_mm_invsqrt_pd(rsq11);
827 rinv12 = gmx_mm_invsqrt_pd(rsq12);
828 rinv20 = gmx_mm_invsqrt_pd(rsq20);
829 rinv21 = gmx_mm_invsqrt_pd(rsq21);
830 rinv22 = gmx_mm_invsqrt_pd(rsq22);
832 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
833 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
834 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
835 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
836 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
837 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
838 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
839 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
840 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
842 fjx0 = _mm_setzero_pd();
843 fjy0 = _mm_setzero_pd();
844 fjz0 = _mm_setzero_pd();
845 fjx1 = _mm_setzero_pd();
846 fjy1 = _mm_setzero_pd();
847 fjz1 = _mm_setzero_pd();
848 fjx2 = _mm_setzero_pd();
849 fjy2 = _mm_setzero_pd();
850 fjz2 = _mm_setzero_pd();
852 /**************************
853 * CALCULATE INTERACTIONS *
854 **************************/
856 if (gmx_mm_any_lt(rsq00,rcutoff2))
859 r00 = _mm_mul_pd(rsq00,rinv00);
861 /* EWALD ELECTROSTATICS */
863 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
864 ewrt = _mm_mul_pd(r00,ewtabscale);
865 ewitab = _mm_cvttpd_epi32(ewrt);
867 eweps = _mm_frcz_pd(ewrt);
869 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
871 twoeweps = _mm_add_pd(eweps,eweps);
872 ewitab = _mm_slli_epi32(ewitab,2);
873 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
874 ewtabD = _mm_setzero_pd();
875 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
876 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
877 ewtabFn = _mm_setzero_pd();
878 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
879 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
880 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
881 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
882 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
884 /* LENNARD-JONES DISPERSION/REPULSION */
886 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
887 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
888 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
889 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
890 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
891 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
893 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
895 /* Update potential sum for this i atom from the interaction with this j atom. */
896 velec = _mm_and_pd(velec,cutoff_mask);
897 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
898 velecsum = _mm_add_pd(velecsum,velec);
899 vvdw = _mm_and_pd(vvdw,cutoff_mask);
900 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
901 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
903 fscal = _mm_add_pd(felec,fvdw);
905 fscal = _mm_and_pd(fscal,cutoff_mask);
907 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
909 /* Update vectorial force */
910 fix0 = _mm_macc_pd(dx00,fscal,fix0);
911 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
912 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
914 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
915 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
916 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
920 /**************************
921 * CALCULATE INTERACTIONS *
922 **************************/
924 if (gmx_mm_any_lt(rsq01,rcutoff2))
927 r01 = _mm_mul_pd(rsq01,rinv01);
929 /* EWALD ELECTROSTATICS */
931 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
932 ewrt = _mm_mul_pd(r01,ewtabscale);
933 ewitab = _mm_cvttpd_epi32(ewrt);
935 eweps = _mm_frcz_pd(ewrt);
937 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
939 twoeweps = _mm_add_pd(eweps,eweps);
940 ewitab = _mm_slli_epi32(ewitab,2);
941 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
942 ewtabD = _mm_setzero_pd();
943 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
944 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
945 ewtabFn = _mm_setzero_pd();
946 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
947 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
948 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
949 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
950 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
952 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
954 /* Update potential sum for this i atom from the interaction with this j atom. */
955 velec = _mm_and_pd(velec,cutoff_mask);
956 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
957 velecsum = _mm_add_pd(velecsum,velec);
961 fscal = _mm_and_pd(fscal,cutoff_mask);
963 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
965 /* Update vectorial force */
966 fix0 = _mm_macc_pd(dx01,fscal,fix0);
967 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
968 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
970 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
971 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
972 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 if (gmx_mm_any_lt(rsq02,rcutoff2))
983 r02 = _mm_mul_pd(rsq02,rinv02);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt = _mm_mul_pd(r02,ewtabscale);
989 ewitab = _mm_cvttpd_epi32(ewrt);
991 eweps = _mm_frcz_pd(ewrt);
993 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
995 twoeweps = _mm_add_pd(eweps,eweps);
996 ewitab = _mm_slli_epi32(ewitab,2);
997 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
998 ewtabD = _mm_setzero_pd();
999 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1000 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1001 ewtabFn = _mm_setzero_pd();
1002 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1003 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1004 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1005 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
1006 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1008 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1010 /* Update potential sum for this i atom from the interaction with this j atom. */
1011 velec = _mm_and_pd(velec,cutoff_mask);
1012 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1013 velecsum = _mm_add_pd(velecsum,velec);
1017 fscal = _mm_and_pd(fscal,cutoff_mask);
1019 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1021 /* Update vectorial force */
1022 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1023 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1024 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1026 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1027 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1028 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1032 /**************************
1033 * CALCULATE INTERACTIONS *
1034 **************************/
1036 if (gmx_mm_any_lt(rsq10,rcutoff2))
1039 r10 = _mm_mul_pd(rsq10,rinv10);
1041 /* EWALD ELECTROSTATICS */
1043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044 ewrt = _mm_mul_pd(r10,ewtabscale);
1045 ewitab = _mm_cvttpd_epi32(ewrt);
1047 eweps = _mm_frcz_pd(ewrt);
1049 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1051 twoeweps = _mm_add_pd(eweps,eweps);
1052 ewitab = _mm_slli_epi32(ewitab,2);
1053 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1054 ewtabD = _mm_setzero_pd();
1055 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1056 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1057 ewtabFn = _mm_setzero_pd();
1058 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1059 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1060 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1061 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
1062 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1064 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1066 /* Update potential sum for this i atom from the interaction with this j atom. */
1067 velec = _mm_and_pd(velec,cutoff_mask);
1068 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1069 velecsum = _mm_add_pd(velecsum,velec);
1073 fscal = _mm_and_pd(fscal,cutoff_mask);
1075 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1077 /* Update vectorial force */
1078 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1079 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1080 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1082 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1083 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1084 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1088 /**************************
1089 * CALCULATE INTERACTIONS *
1090 **************************/
1092 if (gmx_mm_any_lt(rsq11,rcutoff2))
1095 r11 = _mm_mul_pd(rsq11,rinv11);
1097 /* EWALD ELECTROSTATICS */
1099 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100 ewrt = _mm_mul_pd(r11,ewtabscale);
1101 ewitab = _mm_cvttpd_epi32(ewrt);
1103 eweps = _mm_frcz_pd(ewrt);
1105 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1107 twoeweps = _mm_add_pd(eweps,eweps);
1108 ewitab = _mm_slli_epi32(ewitab,2);
1109 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1110 ewtabD = _mm_setzero_pd();
1111 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1112 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1113 ewtabFn = _mm_setzero_pd();
1114 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1115 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1116 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1117 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1118 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1120 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1122 /* Update potential sum for this i atom from the interaction with this j atom. */
1123 velec = _mm_and_pd(velec,cutoff_mask);
1124 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1125 velecsum = _mm_add_pd(velecsum,velec);
1129 fscal = _mm_and_pd(fscal,cutoff_mask);
1131 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1133 /* Update vectorial force */
1134 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1135 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1136 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1138 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1139 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1140 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1144 /**************************
1145 * CALCULATE INTERACTIONS *
1146 **************************/
1148 if (gmx_mm_any_lt(rsq12,rcutoff2))
1151 r12 = _mm_mul_pd(rsq12,rinv12);
1153 /* EWALD ELECTROSTATICS */
1155 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1156 ewrt = _mm_mul_pd(r12,ewtabscale);
1157 ewitab = _mm_cvttpd_epi32(ewrt);
1159 eweps = _mm_frcz_pd(ewrt);
1161 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1163 twoeweps = _mm_add_pd(eweps,eweps);
1164 ewitab = _mm_slli_epi32(ewitab,2);
1165 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1166 ewtabD = _mm_setzero_pd();
1167 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1168 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1169 ewtabFn = _mm_setzero_pd();
1170 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1171 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1172 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1173 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1174 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1176 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm_and_pd(velec,cutoff_mask);
1180 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1181 velecsum = _mm_add_pd(velecsum,velec);
1185 fscal = _mm_and_pd(fscal,cutoff_mask);
1187 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1189 /* Update vectorial force */
1190 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1191 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1192 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1194 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1195 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1196 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1200 /**************************
1201 * CALCULATE INTERACTIONS *
1202 **************************/
1204 if (gmx_mm_any_lt(rsq20,rcutoff2))
1207 r20 = _mm_mul_pd(rsq20,rinv20);
1209 /* EWALD ELECTROSTATICS */
1211 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1212 ewrt = _mm_mul_pd(r20,ewtabscale);
1213 ewitab = _mm_cvttpd_epi32(ewrt);
1215 eweps = _mm_frcz_pd(ewrt);
1217 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1219 twoeweps = _mm_add_pd(eweps,eweps);
1220 ewitab = _mm_slli_epi32(ewitab,2);
1221 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1222 ewtabD = _mm_setzero_pd();
1223 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1224 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1225 ewtabFn = _mm_setzero_pd();
1226 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1227 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1228 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1229 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
1230 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1232 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1234 /* Update potential sum for this i atom from the interaction with this j atom. */
1235 velec = _mm_and_pd(velec,cutoff_mask);
1236 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1237 velecsum = _mm_add_pd(velecsum,velec);
1241 fscal = _mm_and_pd(fscal,cutoff_mask);
1243 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1245 /* Update vectorial force */
1246 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1247 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1248 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1250 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1251 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1252 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1256 /**************************
1257 * CALCULATE INTERACTIONS *
1258 **************************/
1260 if (gmx_mm_any_lt(rsq21,rcutoff2))
1263 r21 = _mm_mul_pd(rsq21,rinv21);
1265 /* EWALD ELECTROSTATICS */
1267 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268 ewrt = _mm_mul_pd(r21,ewtabscale);
1269 ewitab = _mm_cvttpd_epi32(ewrt);
1271 eweps = _mm_frcz_pd(ewrt);
1273 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1275 twoeweps = _mm_add_pd(eweps,eweps);
1276 ewitab = _mm_slli_epi32(ewitab,2);
1277 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1278 ewtabD = _mm_setzero_pd();
1279 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1280 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1281 ewtabFn = _mm_setzero_pd();
1282 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1283 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1284 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1285 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1286 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1288 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1290 /* Update potential sum for this i atom from the interaction with this j atom. */
1291 velec = _mm_and_pd(velec,cutoff_mask);
1292 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1293 velecsum = _mm_add_pd(velecsum,velec);
1297 fscal = _mm_and_pd(fscal,cutoff_mask);
1299 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1301 /* Update vectorial force */
1302 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1303 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1304 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1306 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1307 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1308 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1312 /**************************
1313 * CALCULATE INTERACTIONS *
1314 **************************/
1316 if (gmx_mm_any_lt(rsq22,rcutoff2))
1319 r22 = _mm_mul_pd(rsq22,rinv22);
1321 /* EWALD ELECTROSTATICS */
1323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1324 ewrt = _mm_mul_pd(r22,ewtabscale);
1325 ewitab = _mm_cvttpd_epi32(ewrt);
1327 eweps = _mm_frcz_pd(ewrt);
1329 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1331 twoeweps = _mm_add_pd(eweps,eweps);
1332 ewitab = _mm_slli_epi32(ewitab,2);
1333 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1334 ewtabD = _mm_setzero_pd();
1335 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1336 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
1337 ewtabFn = _mm_setzero_pd();
1338 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1339 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
1340 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
1341 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1342 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1344 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1346 /* Update potential sum for this i atom from the interaction with this j atom. */
1347 velec = _mm_and_pd(velec,cutoff_mask);
1348 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1349 velecsum = _mm_add_pd(velecsum,velec);
1353 fscal = _mm_and_pd(fscal,cutoff_mask);
1355 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1357 /* Update vectorial force */
1358 fix2 = _mm_macc_pd(dx22,fscal,fix2);
1359 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
1360 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
1362 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
1363 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
1364 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
1368 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1370 /* Inner loop uses 459 flops */
1373 /* End of innermost loop */
1375 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1376 f+i_coord_offset,fshift+i_shift_offset);
1379 /* Update potential energies */
1380 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1381 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1383 /* Increment number of inner iterations */
1384 inneriter += j_index_end - j_index_start;
1386 /* Outer loop uses 20 flops */
1389 /* Increment number of outer iterations */
1392 /* Update outer/inner flops */
1394 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
1397 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_double
1398 * Electrostatics interaction: Ewald
1399 * VdW interaction: LennardJones
1400 * Geometry: Water3-Water3
1401 * Calculate force/pot: Force
1404 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_128_fma_double
1405 (t_nblist * gmx_restrict nlist,
1406 rvec * gmx_restrict xx,
1407 rvec * gmx_restrict ff,
1408 t_forcerec * gmx_restrict fr,
1409 t_mdatoms * gmx_restrict mdatoms,
1410 nb_kernel_data_t * gmx_restrict kernel_data,
1411 t_nrnb * gmx_restrict nrnb)
1413 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1414 * just 0 for non-waters.
1415 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1416 * jnr indices corresponding to data put in the four positions in the SIMD register.
1418 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1419 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1421 int j_coord_offsetA,j_coord_offsetB;
1422 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1423 real rcutoff_scalar;
1424 real *shiftvec,*fshift,*x,*f;
1425 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1427 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1429 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1431 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1432 int vdwjidx0A,vdwjidx0B;
1433 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1434 int vdwjidx1A,vdwjidx1B;
1435 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1436 int vdwjidx2A,vdwjidx2B;
1437 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1438 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1439 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1440 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1441 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1442 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1443 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1444 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1445 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1446 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1447 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1450 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1453 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1454 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1456 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1458 __m128d dummy_mask,cutoff_mask;
1459 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1460 __m128d one = _mm_set1_pd(1.0);
1461 __m128d two = _mm_set1_pd(2.0);
1467 jindex = nlist->jindex;
1469 shiftidx = nlist->shift;
1471 shiftvec = fr->shift_vec[0];
1472 fshift = fr->fshift[0];
1473 facel = _mm_set1_pd(fr->epsfac);
1474 charge = mdatoms->chargeA;
1475 nvdwtype = fr->ntype;
1476 vdwparam = fr->nbfp;
1477 vdwtype = mdatoms->typeA;
1479 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1480 ewtab = fr->ic->tabq_coul_F;
1481 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1482 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1484 /* Setup water-specific parameters */
1485 inr = nlist->iinr[0];
1486 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1487 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1488 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1489 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1491 jq0 = _mm_set1_pd(charge[inr+0]);
1492 jq1 = _mm_set1_pd(charge[inr+1]);
1493 jq2 = _mm_set1_pd(charge[inr+2]);
1494 vdwjidx0A = 2*vdwtype[inr+0];
1495 qq00 = _mm_mul_pd(iq0,jq0);
1496 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1497 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1498 qq01 = _mm_mul_pd(iq0,jq1);
1499 qq02 = _mm_mul_pd(iq0,jq2);
1500 qq10 = _mm_mul_pd(iq1,jq0);
1501 qq11 = _mm_mul_pd(iq1,jq1);
1502 qq12 = _mm_mul_pd(iq1,jq2);
1503 qq20 = _mm_mul_pd(iq2,jq0);
1504 qq21 = _mm_mul_pd(iq2,jq1);
1505 qq22 = _mm_mul_pd(iq2,jq2);
1507 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1508 rcutoff_scalar = fr->rcoulomb;
1509 rcutoff = _mm_set1_pd(rcutoff_scalar);
1510 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1512 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
1513 rvdw = _mm_set1_pd(fr->rvdw);
1515 /* Avoid stupid compiler warnings */
1517 j_coord_offsetA = 0;
1518 j_coord_offsetB = 0;
1523 /* Start outer loop over neighborlists */
1524 for(iidx=0; iidx<nri; iidx++)
1526 /* Load shift vector for this list */
1527 i_shift_offset = DIM*shiftidx[iidx];
1529 /* Load limits for loop over neighbors */
1530 j_index_start = jindex[iidx];
1531 j_index_end = jindex[iidx+1];
1533 /* Get outer coordinate index */
1535 i_coord_offset = DIM*inr;
1537 /* Load i particle coords and add shift vector */
1538 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1539 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1541 fix0 = _mm_setzero_pd();
1542 fiy0 = _mm_setzero_pd();
1543 fiz0 = _mm_setzero_pd();
1544 fix1 = _mm_setzero_pd();
1545 fiy1 = _mm_setzero_pd();
1546 fiz1 = _mm_setzero_pd();
1547 fix2 = _mm_setzero_pd();
1548 fiy2 = _mm_setzero_pd();
1549 fiz2 = _mm_setzero_pd();
1551 /* Start inner kernel loop */
1552 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1555 /* Get j neighbor index, and coordinate index */
1557 jnrB = jjnr[jidx+1];
1558 j_coord_offsetA = DIM*jnrA;
1559 j_coord_offsetB = DIM*jnrB;
1561 /* load j atom coordinates */
1562 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1563 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1565 /* Calculate displacement vector */
1566 dx00 = _mm_sub_pd(ix0,jx0);
1567 dy00 = _mm_sub_pd(iy0,jy0);
1568 dz00 = _mm_sub_pd(iz0,jz0);
1569 dx01 = _mm_sub_pd(ix0,jx1);
1570 dy01 = _mm_sub_pd(iy0,jy1);
1571 dz01 = _mm_sub_pd(iz0,jz1);
1572 dx02 = _mm_sub_pd(ix0,jx2);
1573 dy02 = _mm_sub_pd(iy0,jy2);
1574 dz02 = _mm_sub_pd(iz0,jz2);
1575 dx10 = _mm_sub_pd(ix1,jx0);
1576 dy10 = _mm_sub_pd(iy1,jy0);
1577 dz10 = _mm_sub_pd(iz1,jz0);
1578 dx11 = _mm_sub_pd(ix1,jx1);
1579 dy11 = _mm_sub_pd(iy1,jy1);
1580 dz11 = _mm_sub_pd(iz1,jz1);
1581 dx12 = _mm_sub_pd(ix1,jx2);
1582 dy12 = _mm_sub_pd(iy1,jy2);
1583 dz12 = _mm_sub_pd(iz1,jz2);
1584 dx20 = _mm_sub_pd(ix2,jx0);
1585 dy20 = _mm_sub_pd(iy2,jy0);
1586 dz20 = _mm_sub_pd(iz2,jz0);
1587 dx21 = _mm_sub_pd(ix2,jx1);
1588 dy21 = _mm_sub_pd(iy2,jy1);
1589 dz21 = _mm_sub_pd(iz2,jz1);
1590 dx22 = _mm_sub_pd(ix2,jx2);
1591 dy22 = _mm_sub_pd(iy2,jy2);
1592 dz22 = _mm_sub_pd(iz2,jz2);
1594 /* Calculate squared distance and things based on it */
1595 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1596 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1597 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1598 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1599 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1600 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1601 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1602 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1603 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1605 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1606 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1607 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1608 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1609 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1610 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1611 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1612 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1613 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1615 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1616 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1617 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1618 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1619 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1620 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1621 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1622 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1623 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1625 fjx0 = _mm_setzero_pd();
1626 fjy0 = _mm_setzero_pd();
1627 fjz0 = _mm_setzero_pd();
1628 fjx1 = _mm_setzero_pd();
1629 fjy1 = _mm_setzero_pd();
1630 fjz1 = _mm_setzero_pd();
1631 fjx2 = _mm_setzero_pd();
1632 fjy2 = _mm_setzero_pd();
1633 fjz2 = _mm_setzero_pd();
1635 /**************************
1636 * CALCULATE INTERACTIONS *
1637 **************************/
1639 if (gmx_mm_any_lt(rsq00,rcutoff2))
1642 r00 = _mm_mul_pd(rsq00,rinv00);
1644 /* EWALD ELECTROSTATICS */
1646 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1647 ewrt = _mm_mul_pd(r00,ewtabscale);
1648 ewitab = _mm_cvttpd_epi32(ewrt);
1650 eweps = _mm_frcz_pd(ewrt);
1652 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1654 twoeweps = _mm_add_pd(eweps,eweps);
1655 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1657 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1658 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1660 /* LENNARD-JONES DISPERSION/REPULSION */
1662 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1663 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1665 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1667 fscal = _mm_add_pd(felec,fvdw);
1669 fscal = _mm_and_pd(fscal,cutoff_mask);
1671 /* Update vectorial force */
1672 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1673 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1674 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1676 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1677 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1678 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1682 /**************************
1683 * CALCULATE INTERACTIONS *
1684 **************************/
1686 if (gmx_mm_any_lt(rsq01,rcutoff2))
1689 r01 = _mm_mul_pd(rsq01,rinv01);
1691 /* EWALD ELECTROSTATICS */
1693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1694 ewrt = _mm_mul_pd(r01,ewtabscale);
1695 ewitab = _mm_cvttpd_epi32(ewrt);
1697 eweps = _mm_frcz_pd(ewrt);
1699 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1701 twoeweps = _mm_add_pd(eweps,eweps);
1702 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1704 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1705 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1707 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1711 fscal = _mm_and_pd(fscal,cutoff_mask);
1713 /* Update vectorial force */
1714 fix0 = _mm_macc_pd(dx01,fscal,fix0);
1715 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
1716 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
1718 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
1719 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
1720 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
1724 /**************************
1725 * CALCULATE INTERACTIONS *
1726 **************************/
1728 if (gmx_mm_any_lt(rsq02,rcutoff2))
1731 r02 = _mm_mul_pd(rsq02,rinv02);
1733 /* EWALD ELECTROSTATICS */
1735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736 ewrt = _mm_mul_pd(r02,ewtabscale);
1737 ewitab = _mm_cvttpd_epi32(ewrt);
1739 eweps = _mm_frcz_pd(ewrt);
1741 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1743 twoeweps = _mm_add_pd(eweps,eweps);
1744 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1746 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1747 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1749 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1753 fscal = _mm_and_pd(fscal,cutoff_mask);
1755 /* Update vectorial force */
1756 fix0 = _mm_macc_pd(dx02,fscal,fix0);
1757 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
1758 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
1760 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
1761 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
1762 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 if (gmx_mm_any_lt(rsq10,rcutoff2))
1773 r10 = _mm_mul_pd(rsq10,rinv10);
1775 /* EWALD ELECTROSTATICS */
1777 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778 ewrt = _mm_mul_pd(r10,ewtabscale);
1779 ewitab = _mm_cvttpd_epi32(ewrt);
1781 eweps = _mm_frcz_pd(ewrt);
1783 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1785 twoeweps = _mm_add_pd(eweps,eweps);
1786 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1788 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1789 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1791 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1795 fscal = _mm_and_pd(fscal,cutoff_mask);
1797 /* Update vectorial force */
1798 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1799 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1800 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1802 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1803 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1804 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 if (gmx_mm_any_lt(rsq11,rcutoff2))
1815 r11 = _mm_mul_pd(rsq11,rinv11);
1817 /* EWALD ELECTROSTATICS */
1819 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1820 ewrt = _mm_mul_pd(r11,ewtabscale);
1821 ewitab = _mm_cvttpd_epi32(ewrt);
1823 eweps = _mm_frcz_pd(ewrt);
1825 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1827 twoeweps = _mm_add_pd(eweps,eweps);
1828 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1830 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1831 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1833 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1837 fscal = _mm_and_pd(fscal,cutoff_mask);
1839 /* Update vectorial force */
1840 fix1 = _mm_macc_pd(dx11,fscal,fix1);
1841 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
1842 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
1844 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
1845 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
1846 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
1850 /**************************
1851 * CALCULATE INTERACTIONS *
1852 **************************/
1854 if (gmx_mm_any_lt(rsq12,rcutoff2))
1857 r12 = _mm_mul_pd(rsq12,rinv12);
1859 /* EWALD ELECTROSTATICS */
1861 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862 ewrt = _mm_mul_pd(r12,ewtabscale);
1863 ewitab = _mm_cvttpd_epi32(ewrt);
1865 eweps = _mm_frcz_pd(ewrt);
1867 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1869 twoeweps = _mm_add_pd(eweps,eweps);
1870 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1872 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1873 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1875 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1879 fscal = _mm_and_pd(fscal,cutoff_mask);
1881 /* Update vectorial force */
1882 fix1 = _mm_macc_pd(dx12,fscal,fix1);
1883 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
1884 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
1886 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
1887 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
1888 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
1892 /**************************
1893 * CALCULATE INTERACTIONS *
1894 **************************/
1896 if (gmx_mm_any_lt(rsq20,rcutoff2))
1899 r20 = _mm_mul_pd(rsq20,rinv20);
1901 /* EWALD ELECTROSTATICS */
1903 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1904 ewrt = _mm_mul_pd(r20,ewtabscale);
1905 ewitab = _mm_cvttpd_epi32(ewrt);
1907 eweps = _mm_frcz_pd(ewrt);
1909 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1911 twoeweps = _mm_add_pd(eweps,eweps);
1912 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1914 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1915 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1917 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1921 fscal = _mm_and_pd(fscal,cutoff_mask);
1923 /* Update vectorial force */
1924 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1925 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1926 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1928 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1929 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1930 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1934 /**************************
1935 * CALCULATE INTERACTIONS *
1936 **************************/
1938 if (gmx_mm_any_lt(rsq21,rcutoff2))
1941 r21 = _mm_mul_pd(rsq21,rinv21);
1943 /* EWALD ELECTROSTATICS */
1945 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1946 ewrt = _mm_mul_pd(r21,ewtabscale);
1947 ewitab = _mm_cvttpd_epi32(ewrt);
1949 eweps = _mm_frcz_pd(ewrt);
1951 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1953 twoeweps = _mm_add_pd(eweps,eweps);
1954 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1956 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1957 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1959 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1963 fscal = _mm_and_pd(fscal,cutoff_mask);
1965 /* Update vectorial force */
1966 fix2 = _mm_macc_pd(dx21,fscal,fix2);
1967 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
1968 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
1970 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
1971 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
1972 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 if (gmx_mm_any_lt(rsq22,rcutoff2))
1983 r22 = _mm_mul_pd(rsq22,rinv22);
1985 /* EWALD ELECTROSTATICS */
1987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988 ewrt = _mm_mul_pd(r22,ewtabscale);
1989 ewitab = _mm_cvttpd_epi32(ewrt);
1991 eweps = _mm_frcz_pd(ewrt);
1993 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1995 twoeweps = _mm_add_pd(eweps,eweps);
1996 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1998 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1999 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2001 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2005 fscal = _mm_and_pd(fscal,cutoff_mask);
2007 /* Update vectorial force */
2008 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2009 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2010 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2012 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2013 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2014 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2018 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2020 /* Inner loop uses 385 flops */
2023 if(jidx<j_index_end)
2027 j_coord_offsetA = DIM*jnrA;
2029 /* load j atom coordinates */
2030 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2031 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2033 /* Calculate displacement vector */
2034 dx00 = _mm_sub_pd(ix0,jx0);
2035 dy00 = _mm_sub_pd(iy0,jy0);
2036 dz00 = _mm_sub_pd(iz0,jz0);
2037 dx01 = _mm_sub_pd(ix0,jx1);
2038 dy01 = _mm_sub_pd(iy0,jy1);
2039 dz01 = _mm_sub_pd(iz0,jz1);
2040 dx02 = _mm_sub_pd(ix0,jx2);
2041 dy02 = _mm_sub_pd(iy0,jy2);
2042 dz02 = _mm_sub_pd(iz0,jz2);
2043 dx10 = _mm_sub_pd(ix1,jx0);
2044 dy10 = _mm_sub_pd(iy1,jy0);
2045 dz10 = _mm_sub_pd(iz1,jz0);
2046 dx11 = _mm_sub_pd(ix1,jx1);
2047 dy11 = _mm_sub_pd(iy1,jy1);
2048 dz11 = _mm_sub_pd(iz1,jz1);
2049 dx12 = _mm_sub_pd(ix1,jx2);
2050 dy12 = _mm_sub_pd(iy1,jy2);
2051 dz12 = _mm_sub_pd(iz1,jz2);
2052 dx20 = _mm_sub_pd(ix2,jx0);
2053 dy20 = _mm_sub_pd(iy2,jy0);
2054 dz20 = _mm_sub_pd(iz2,jz0);
2055 dx21 = _mm_sub_pd(ix2,jx1);
2056 dy21 = _mm_sub_pd(iy2,jy1);
2057 dz21 = _mm_sub_pd(iz2,jz1);
2058 dx22 = _mm_sub_pd(ix2,jx2);
2059 dy22 = _mm_sub_pd(iy2,jy2);
2060 dz22 = _mm_sub_pd(iz2,jz2);
2062 /* Calculate squared distance and things based on it */
2063 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2064 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2065 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2066 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2067 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2068 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2069 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2070 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2071 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2073 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2074 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2075 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2076 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2077 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2078 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2079 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2080 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2081 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2083 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2084 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2085 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2086 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2087 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2088 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2089 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2090 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2091 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2093 fjx0 = _mm_setzero_pd();
2094 fjy0 = _mm_setzero_pd();
2095 fjz0 = _mm_setzero_pd();
2096 fjx1 = _mm_setzero_pd();
2097 fjy1 = _mm_setzero_pd();
2098 fjz1 = _mm_setzero_pd();
2099 fjx2 = _mm_setzero_pd();
2100 fjy2 = _mm_setzero_pd();
2101 fjz2 = _mm_setzero_pd();
2103 /**************************
2104 * CALCULATE INTERACTIONS *
2105 **************************/
2107 if (gmx_mm_any_lt(rsq00,rcutoff2))
2110 r00 = _mm_mul_pd(rsq00,rinv00);
2112 /* EWALD ELECTROSTATICS */
2114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2115 ewrt = _mm_mul_pd(r00,ewtabscale);
2116 ewitab = _mm_cvttpd_epi32(ewrt);
2118 eweps = _mm_frcz_pd(ewrt);
2120 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2122 twoeweps = _mm_add_pd(eweps,eweps);
2123 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2124 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2125 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2127 /* LENNARD-JONES DISPERSION/REPULSION */
2129 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2130 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2132 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2134 fscal = _mm_add_pd(felec,fvdw);
2136 fscal = _mm_and_pd(fscal,cutoff_mask);
2138 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2140 /* Update vectorial force */
2141 fix0 = _mm_macc_pd(dx00,fscal,fix0);
2142 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
2143 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
2145 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
2146 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
2147 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 if (gmx_mm_any_lt(rsq01,rcutoff2))
2158 r01 = _mm_mul_pd(rsq01,rinv01);
2160 /* EWALD ELECTROSTATICS */
2162 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2163 ewrt = _mm_mul_pd(r01,ewtabscale);
2164 ewitab = _mm_cvttpd_epi32(ewrt);
2166 eweps = _mm_frcz_pd(ewrt);
2168 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2170 twoeweps = _mm_add_pd(eweps,eweps);
2171 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2172 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2173 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2175 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2179 fscal = _mm_and_pd(fscal,cutoff_mask);
2181 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2183 /* Update vectorial force */
2184 fix0 = _mm_macc_pd(dx01,fscal,fix0);
2185 fiy0 = _mm_macc_pd(dy01,fscal,fiy0);
2186 fiz0 = _mm_macc_pd(dz01,fscal,fiz0);
2188 fjx1 = _mm_macc_pd(dx01,fscal,fjx1);
2189 fjy1 = _mm_macc_pd(dy01,fscal,fjy1);
2190 fjz1 = _mm_macc_pd(dz01,fscal,fjz1);
2194 /**************************
2195 * CALCULATE INTERACTIONS *
2196 **************************/
2198 if (gmx_mm_any_lt(rsq02,rcutoff2))
2201 r02 = _mm_mul_pd(rsq02,rinv02);
2203 /* EWALD ELECTROSTATICS */
2205 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2206 ewrt = _mm_mul_pd(r02,ewtabscale);
2207 ewitab = _mm_cvttpd_epi32(ewrt);
2209 eweps = _mm_frcz_pd(ewrt);
2211 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2213 twoeweps = _mm_add_pd(eweps,eweps);
2214 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2215 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2216 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2218 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2222 fscal = _mm_and_pd(fscal,cutoff_mask);
2224 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2226 /* Update vectorial force */
2227 fix0 = _mm_macc_pd(dx02,fscal,fix0);
2228 fiy0 = _mm_macc_pd(dy02,fscal,fiy0);
2229 fiz0 = _mm_macc_pd(dz02,fscal,fiz0);
2231 fjx2 = _mm_macc_pd(dx02,fscal,fjx2);
2232 fjy2 = _mm_macc_pd(dy02,fscal,fjy2);
2233 fjz2 = _mm_macc_pd(dz02,fscal,fjz2);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 if (gmx_mm_any_lt(rsq10,rcutoff2))
2244 r10 = _mm_mul_pd(rsq10,rinv10);
2246 /* EWALD ELECTROSTATICS */
2248 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2249 ewrt = _mm_mul_pd(r10,ewtabscale);
2250 ewitab = _mm_cvttpd_epi32(ewrt);
2252 eweps = _mm_frcz_pd(ewrt);
2254 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2256 twoeweps = _mm_add_pd(eweps,eweps);
2257 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2258 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2259 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2261 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2265 fscal = _mm_and_pd(fscal,cutoff_mask);
2267 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2269 /* Update vectorial force */
2270 fix1 = _mm_macc_pd(dx10,fscal,fix1);
2271 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
2272 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
2274 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
2275 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
2276 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
2280 /**************************
2281 * CALCULATE INTERACTIONS *
2282 **************************/
2284 if (gmx_mm_any_lt(rsq11,rcutoff2))
2287 r11 = _mm_mul_pd(rsq11,rinv11);
2289 /* EWALD ELECTROSTATICS */
2291 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2292 ewrt = _mm_mul_pd(r11,ewtabscale);
2293 ewitab = _mm_cvttpd_epi32(ewrt);
2295 eweps = _mm_frcz_pd(ewrt);
2297 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2299 twoeweps = _mm_add_pd(eweps,eweps);
2300 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2301 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2302 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2304 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2308 fscal = _mm_and_pd(fscal,cutoff_mask);
2310 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2312 /* Update vectorial force */
2313 fix1 = _mm_macc_pd(dx11,fscal,fix1);
2314 fiy1 = _mm_macc_pd(dy11,fscal,fiy1);
2315 fiz1 = _mm_macc_pd(dz11,fscal,fiz1);
2317 fjx1 = _mm_macc_pd(dx11,fscal,fjx1);
2318 fjy1 = _mm_macc_pd(dy11,fscal,fjy1);
2319 fjz1 = _mm_macc_pd(dz11,fscal,fjz1);
2323 /**************************
2324 * CALCULATE INTERACTIONS *
2325 **************************/
2327 if (gmx_mm_any_lt(rsq12,rcutoff2))
2330 r12 = _mm_mul_pd(rsq12,rinv12);
2332 /* EWALD ELECTROSTATICS */
2334 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335 ewrt = _mm_mul_pd(r12,ewtabscale);
2336 ewitab = _mm_cvttpd_epi32(ewrt);
2338 eweps = _mm_frcz_pd(ewrt);
2340 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2342 twoeweps = _mm_add_pd(eweps,eweps);
2343 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2344 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2345 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2347 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2351 fscal = _mm_and_pd(fscal,cutoff_mask);
2353 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2355 /* Update vectorial force */
2356 fix1 = _mm_macc_pd(dx12,fscal,fix1);
2357 fiy1 = _mm_macc_pd(dy12,fscal,fiy1);
2358 fiz1 = _mm_macc_pd(dz12,fscal,fiz1);
2360 fjx2 = _mm_macc_pd(dx12,fscal,fjx2);
2361 fjy2 = _mm_macc_pd(dy12,fscal,fjy2);
2362 fjz2 = _mm_macc_pd(dz12,fscal,fjz2);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 if (gmx_mm_any_lt(rsq20,rcutoff2))
2373 r20 = _mm_mul_pd(rsq20,rinv20);
2375 /* EWALD ELECTROSTATICS */
2377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2378 ewrt = _mm_mul_pd(r20,ewtabscale);
2379 ewitab = _mm_cvttpd_epi32(ewrt);
2381 eweps = _mm_frcz_pd(ewrt);
2383 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2385 twoeweps = _mm_add_pd(eweps,eweps);
2386 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2387 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2388 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2390 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2394 fscal = _mm_and_pd(fscal,cutoff_mask);
2396 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2398 /* Update vectorial force */
2399 fix2 = _mm_macc_pd(dx20,fscal,fix2);
2400 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
2401 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
2403 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
2404 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
2405 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
2409 /**************************
2410 * CALCULATE INTERACTIONS *
2411 **************************/
2413 if (gmx_mm_any_lt(rsq21,rcutoff2))
2416 r21 = _mm_mul_pd(rsq21,rinv21);
2418 /* EWALD ELECTROSTATICS */
2420 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2421 ewrt = _mm_mul_pd(r21,ewtabscale);
2422 ewitab = _mm_cvttpd_epi32(ewrt);
2424 eweps = _mm_frcz_pd(ewrt);
2426 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2428 twoeweps = _mm_add_pd(eweps,eweps);
2429 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2430 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2431 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2433 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2437 fscal = _mm_and_pd(fscal,cutoff_mask);
2439 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2441 /* Update vectorial force */
2442 fix2 = _mm_macc_pd(dx21,fscal,fix2);
2443 fiy2 = _mm_macc_pd(dy21,fscal,fiy2);
2444 fiz2 = _mm_macc_pd(dz21,fscal,fiz2);
2446 fjx1 = _mm_macc_pd(dx21,fscal,fjx1);
2447 fjy1 = _mm_macc_pd(dy21,fscal,fjy1);
2448 fjz1 = _mm_macc_pd(dz21,fscal,fjz1);
2452 /**************************
2453 * CALCULATE INTERACTIONS *
2454 **************************/
2456 if (gmx_mm_any_lt(rsq22,rcutoff2))
2459 r22 = _mm_mul_pd(rsq22,rinv22);
2461 /* EWALD ELECTROSTATICS */
2463 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2464 ewrt = _mm_mul_pd(r22,ewtabscale);
2465 ewitab = _mm_cvttpd_epi32(ewrt);
2467 eweps = _mm_frcz_pd(ewrt);
2469 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2471 twoeweps = _mm_add_pd(eweps,eweps);
2472 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2473 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
2474 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2476 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2480 fscal = _mm_and_pd(fscal,cutoff_mask);
2482 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2484 /* Update vectorial force */
2485 fix2 = _mm_macc_pd(dx22,fscal,fix2);
2486 fiy2 = _mm_macc_pd(dy22,fscal,fiy2);
2487 fiz2 = _mm_macc_pd(dz22,fscal,fiz2);
2489 fjx2 = _mm_macc_pd(dx22,fscal,fjx2);
2490 fjy2 = _mm_macc_pd(dy22,fscal,fjy2);
2491 fjz2 = _mm_macc_pd(dz22,fscal,fjz2);
2495 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2497 /* Inner loop uses 385 flops */
2500 /* End of innermost loop */
2502 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2503 f+i_coord_offset,fshift+i_shift_offset);
2505 /* Increment number of inner iterations */
2506 inneriter += j_index_end - j_index_start;
2508 /* Outer loop uses 18 flops */
2511 /* Increment number of outer iterations */
2514 /* Update outer/inner flops */
2516 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*385);