2 * Note: this file was generated by the Gromacs sse4_1_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_double.h"
34 #include "kernelutil_x86_sse4_1_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse4_1_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B;
73 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B;
75 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B;
77 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
94 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
96 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
98 __m128d dummy_mask,cutoff_mask;
99 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
100 __m128d one = _mm_set1_pd(1.0);
101 __m128d two = _mm_set1_pd(2.0);
107 jindex = nlist->jindex;
109 shiftidx = nlist->shift;
111 shiftvec = fr->shift_vec[0];
112 fshift = fr->fshift[0];
113 facel = _mm_set1_pd(fr->epsfac);
114 charge = mdatoms->chargeA;
115 nvdwtype = fr->ntype;
117 vdwtype = mdatoms->typeA;
119 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
120 ewtab = fr->ic->tabq_coul_FDV0;
121 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
122 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
124 /* Setup water-specific parameters */
125 inr = nlist->iinr[0];
126 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
127 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
128 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
129 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
131 jq0 = _mm_set1_pd(charge[inr+0]);
132 jq1 = _mm_set1_pd(charge[inr+1]);
133 jq2 = _mm_set1_pd(charge[inr+2]);
134 vdwjidx0A = 2*vdwtype[inr+0];
135 qq00 = _mm_mul_pd(iq0,jq0);
136 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
137 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
138 qq01 = _mm_mul_pd(iq0,jq1);
139 qq02 = _mm_mul_pd(iq0,jq2);
140 qq10 = _mm_mul_pd(iq1,jq0);
141 qq11 = _mm_mul_pd(iq1,jq1);
142 qq12 = _mm_mul_pd(iq1,jq2);
143 qq20 = _mm_mul_pd(iq2,jq0);
144 qq21 = _mm_mul_pd(iq2,jq1);
145 qq22 = _mm_mul_pd(iq2,jq2);
147 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
148 rcutoff_scalar = fr->rcoulomb;
149 rcutoff = _mm_set1_pd(rcutoff_scalar);
150 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
152 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
153 rvdw = _mm_set1_pd(fr->rvdw);
155 /* Avoid stupid compiler warnings */
163 /* Start outer loop over neighborlists */
164 for(iidx=0; iidx<nri; iidx++)
166 /* Load shift vector for this list */
167 i_shift_offset = DIM*shiftidx[iidx];
169 /* Load limits for loop over neighbors */
170 j_index_start = jindex[iidx];
171 j_index_end = jindex[iidx+1];
173 /* Get outer coordinate index */
175 i_coord_offset = DIM*inr;
177 /* Load i particle coords and add shift vector */
178 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
179 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
181 fix0 = _mm_setzero_pd();
182 fiy0 = _mm_setzero_pd();
183 fiz0 = _mm_setzero_pd();
184 fix1 = _mm_setzero_pd();
185 fiy1 = _mm_setzero_pd();
186 fiz1 = _mm_setzero_pd();
187 fix2 = _mm_setzero_pd();
188 fiy2 = _mm_setzero_pd();
189 fiz2 = _mm_setzero_pd();
191 /* Reset potential sums */
192 velecsum = _mm_setzero_pd();
193 vvdwsum = _mm_setzero_pd();
195 /* Start inner kernel loop */
196 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
199 /* Get j neighbor index, and coordinate index */
202 j_coord_offsetA = DIM*jnrA;
203 j_coord_offsetB = DIM*jnrB;
205 /* load j atom coordinates */
206 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
209 /* Calculate displacement vector */
210 dx00 = _mm_sub_pd(ix0,jx0);
211 dy00 = _mm_sub_pd(iy0,jy0);
212 dz00 = _mm_sub_pd(iz0,jz0);
213 dx01 = _mm_sub_pd(ix0,jx1);
214 dy01 = _mm_sub_pd(iy0,jy1);
215 dz01 = _mm_sub_pd(iz0,jz1);
216 dx02 = _mm_sub_pd(ix0,jx2);
217 dy02 = _mm_sub_pd(iy0,jy2);
218 dz02 = _mm_sub_pd(iz0,jz2);
219 dx10 = _mm_sub_pd(ix1,jx0);
220 dy10 = _mm_sub_pd(iy1,jy0);
221 dz10 = _mm_sub_pd(iz1,jz0);
222 dx11 = _mm_sub_pd(ix1,jx1);
223 dy11 = _mm_sub_pd(iy1,jy1);
224 dz11 = _mm_sub_pd(iz1,jz1);
225 dx12 = _mm_sub_pd(ix1,jx2);
226 dy12 = _mm_sub_pd(iy1,jy2);
227 dz12 = _mm_sub_pd(iz1,jz2);
228 dx20 = _mm_sub_pd(ix2,jx0);
229 dy20 = _mm_sub_pd(iy2,jy0);
230 dz20 = _mm_sub_pd(iz2,jz0);
231 dx21 = _mm_sub_pd(ix2,jx1);
232 dy21 = _mm_sub_pd(iy2,jy1);
233 dz21 = _mm_sub_pd(iz2,jz1);
234 dx22 = _mm_sub_pd(ix2,jx2);
235 dy22 = _mm_sub_pd(iy2,jy2);
236 dz22 = _mm_sub_pd(iz2,jz2);
238 /* Calculate squared distance and things based on it */
239 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
240 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
241 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
242 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
243 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
244 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
245 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
246 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
247 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
249 rinv00 = gmx_mm_invsqrt_pd(rsq00);
250 rinv01 = gmx_mm_invsqrt_pd(rsq01);
251 rinv02 = gmx_mm_invsqrt_pd(rsq02);
252 rinv10 = gmx_mm_invsqrt_pd(rsq10);
253 rinv11 = gmx_mm_invsqrt_pd(rsq11);
254 rinv12 = gmx_mm_invsqrt_pd(rsq12);
255 rinv20 = gmx_mm_invsqrt_pd(rsq20);
256 rinv21 = gmx_mm_invsqrt_pd(rsq21);
257 rinv22 = gmx_mm_invsqrt_pd(rsq22);
259 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
260 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
261 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
262 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
263 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
264 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
265 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
266 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
267 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
269 fjx0 = _mm_setzero_pd();
270 fjy0 = _mm_setzero_pd();
271 fjz0 = _mm_setzero_pd();
272 fjx1 = _mm_setzero_pd();
273 fjy1 = _mm_setzero_pd();
274 fjz1 = _mm_setzero_pd();
275 fjx2 = _mm_setzero_pd();
276 fjy2 = _mm_setzero_pd();
277 fjz2 = _mm_setzero_pd();
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 if (gmx_mm_any_lt(rsq00,rcutoff2))
286 r00 = _mm_mul_pd(rsq00,rinv00);
288 /* EWALD ELECTROSTATICS */
290 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
291 ewrt = _mm_mul_pd(r00,ewtabscale);
292 ewitab = _mm_cvttpd_epi32(ewrt);
293 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
294 ewitab = _mm_slli_epi32(ewitab,2);
295 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
296 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
297 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
298 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
299 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
300 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
301 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
302 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
303 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
304 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
306 /* LENNARD-JONES DISPERSION/REPULSION */
308 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
309 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
310 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
311 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
312 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
313 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
315 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
317 /* Update potential sum for this i atom from the interaction with this j atom. */
318 velec = _mm_and_pd(velec,cutoff_mask);
319 velecsum = _mm_add_pd(velecsum,velec);
320 vvdw = _mm_and_pd(vvdw,cutoff_mask);
321 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
323 fscal = _mm_add_pd(felec,fvdw);
325 fscal = _mm_and_pd(fscal,cutoff_mask);
327 /* Calculate temporary vectorial force */
328 tx = _mm_mul_pd(fscal,dx00);
329 ty = _mm_mul_pd(fscal,dy00);
330 tz = _mm_mul_pd(fscal,dz00);
332 /* Update vectorial force */
333 fix0 = _mm_add_pd(fix0,tx);
334 fiy0 = _mm_add_pd(fiy0,ty);
335 fiz0 = _mm_add_pd(fiz0,tz);
337 fjx0 = _mm_add_pd(fjx0,tx);
338 fjy0 = _mm_add_pd(fjy0,ty);
339 fjz0 = _mm_add_pd(fjz0,tz);
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 if (gmx_mm_any_lt(rsq01,rcutoff2))
350 r01 = _mm_mul_pd(rsq01,rinv01);
352 /* EWALD ELECTROSTATICS */
354 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
355 ewrt = _mm_mul_pd(r01,ewtabscale);
356 ewitab = _mm_cvttpd_epi32(ewrt);
357 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
358 ewitab = _mm_slli_epi32(ewitab,2);
359 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
360 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
361 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
362 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
363 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
364 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
365 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
366 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
367 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
368 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
370 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
372 /* Update potential sum for this i atom from the interaction with this j atom. */
373 velec = _mm_and_pd(velec,cutoff_mask);
374 velecsum = _mm_add_pd(velecsum,velec);
378 fscal = _mm_and_pd(fscal,cutoff_mask);
380 /* Calculate temporary vectorial force */
381 tx = _mm_mul_pd(fscal,dx01);
382 ty = _mm_mul_pd(fscal,dy01);
383 tz = _mm_mul_pd(fscal,dz01);
385 /* Update vectorial force */
386 fix0 = _mm_add_pd(fix0,tx);
387 fiy0 = _mm_add_pd(fiy0,ty);
388 fiz0 = _mm_add_pd(fiz0,tz);
390 fjx1 = _mm_add_pd(fjx1,tx);
391 fjy1 = _mm_add_pd(fjy1,ty);
392 fjz1 = _mm_add_pd(fjz1,tz);
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 if (gmx_mm_any_lt(rsq02,rcutoff2))
403 r02 = _mm_mul_pd(rsq02,rinv02);
405 /* EWALD ELECTROSTATICS */
407 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
408 ewrt = _mm_mul_pd(r02,ewtabscale);
409 ewitab = _mm_cvttpd_epi32(ewrt);
410 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
411 ewitab = _mm_slli_epi32(ewitab,2);
412 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
413 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
414 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
415 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
416 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
417 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
418 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
419 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
420 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
421 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
423 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
425 /* Update potential sum for this i atom from the interaction with this j atom. */
426 velec = _mm_and_pd(velec,cutoff_mask);
427 velecsum = _mm_add_pd(velecsum,velec);
431 fscal = _mm_and_pd(fscal,cutoff_mask);
433 /* Calculate temporary vectorial force */
434 tx = _mm_mul_pd(fscal,dx02);
435 ty = _mm_mul_pd(fscal,dy02);
436 tz = _mm_mul_pd(fscal,dz02);
438 /* Update vectorial force */
439 fix0 = _mm_add_pd(fix0,tx);
440 fiy0 = _mm_add_pd(fiy0,ty);
441 fiz0 = _mm_add_pd(fiz0,tz);
443 fjx2 = _mm_add_pd(fjx2,tx);
444 fjy2 = _mm_add_pd(fjy2,ty);
445 fjz2 = _mm_add_pd(fjz2,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 if (gmx_mm_any_lt(rsq10,rcutoff2))
456 r10 = _mm_mul_pd(rsq10,rinv10);
458 /* EWALD ELECTROSTATICS */
460 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
461 ewrt = _mm_mul_pd(r10,ewtabscale);
462 ewitab = _mm_cvttpd_epi32(ewrt);
463 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
464 ewitab = _mm_slli_epi32(ewitab,2);
465 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
466 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
467 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
468 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
469 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
470 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
471 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
472 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
473 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
474 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
476 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
478 /* Update potential sum for this i atom from the interaction with this j atom. */
479 velec = _mm_and_pd(velec,cutoff_mask);
480 velecsum = _mm_add_pd(velecsum,velec);
484 fscal = _mm_and_pd(fscal,cutoff_mask);
486 /* Calculate temporary vectorial force */
487 tx = _mm_mul_pd(fscal,dx10);
488 ty = _mm_mul_pd(fscal,dy10);
489 tz = _mm_mul_pd(fscal,dz10);
491 /* Update vectorial force */
492 fix1 = _mm_add_pd(fix1,tx);
493 fiy1 = _mm_add_pd(fiy1,ty);
494 fiz1 = _mm_add_pd(fiz1,tz);
496 fjx0 = _mm_add_pd(fjx0,tx);
497 fjy0 = _mm_add_pd(fjy0,ty);
498 fjz0 = _mm_add_pd(fjz0,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 if (gmx_mm_any_lt(rsq11,rcutoff2))
509 r11 = _mm_mul_pd(rsq11,rinv11);
511 /* EWALD ELECTROSTATICS */
513 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
514 ewrt = _mm_mul_pd(r11,ewtabscale);
515 ewitab = _mm_cvttpd_epi32(ewrt);
516 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
517 ewitab = _mm_slli_epi32(ewitab,2);
518 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
519 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
520 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
521 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
522 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
523 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
524 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
525 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
526 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
527 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
529 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
531 /* Update potential sum for this i atom from the interaction with this j atom. */
532 velec = _mm_and_pd(velec,cutoff_mask);
533 velecsum = _mm_add_pd(velecsum,velec);
537 fscal = _mm_and_pd(fscal,cutoff_mask);
539 /* Calculate temporary vectorial force */
540 tx = _mm_mul_pd(fscal,dx11);
541 ty = _mm_mul_pd(fscal,dy11);
542 tz = _mm_mul_pd(fscal,dz11);
544 /* Update vectorial force */
545 fix1 = _mm_add_pd(fix1,tx);
546 fiy1 = _mm_add_pd(fiy1,ty);
547 fiz1 = _mm_add_pd(fiz1,tz);
549 fjx1 = _mm_add_pd(fjx1,tx);
550 fjy1 = _mm_add_pd(fjy1,ty);
551 fjz1 = _mm_add_pd(fjz1,tz);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 if (gmx_mm_any_lt(rsq12,rcutoff2))
562 r12 = _mm_mul_pd(rsq12,rinv12);
564 /* EWALD ELECTROSTATICS */
566 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
567 ewrt = _mm_mul_pd(r12,ewtabscale);
568 ewitab = _mm_cvttpd_epi32(ewrt);
569 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
570 ewitab = _mm_slli_epi32(ewitab,2);
571 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
572 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
573 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
574 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
575 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
576 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
577 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
578 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
579 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
580 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
582 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
584 /* Update potential sum for this i atom from the interaction with this j atom. */
585 velec = _mm_and_pd(velec,cutoff_mask);
586 velecsum = _mm_add_pd(velecsum,velec);
590 fscal = _mm_and_pd(fscal,cutoff_mask);
592 /* Calculate temporary vectorial force */
593 tx = _mm_mul_pd(fscal,dx12);
594 ty = _mm_mul_pd(fscal,dy12);
595 tz = _mm_mul_pd(fscal,dz12);
597 /* Update vectorial force */
598 fix1 = _mm_add_pd(fix1,tx);
599 fiy1 = _mm_add_pd(fiy1,ty);
600 fiz1 = _mm_add_pd(fiz1,tz);
602 fjx2 = _mm_add_pd(fjx2,tx);
603 fjy2 = _mm_add_pd(fjy2,ty);
604 fjz2 = _mm_add_pd(fjz2,tz);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 if (gmx_mm_any_lt(rsq20,rcutoff2))
615 r20 = _mm_mul_pd(rsq20,rinv20);
617 /* EWALD ELECTROSTATICS */
619 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
620 ewrt = _mm_mul_pd(r20,ewtabscale);
621 ewitab = _mm_cvttpd_epi32(ewrt);
622 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
623 ewitab = _mm_slli_epi32(ewitab,2);
624 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
625 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
626 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
627 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
628 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
629 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
630 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
631 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
632 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
633 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
635 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
637 /* Update potential sum for this i atom from the interaction with this j atom. */
638 velec = _mm_and_pd(velec,cutoff_mask);
639 velecsum = _mm_add_pd(velecsum,velec);
643 fscal = _mm_and_pd(fscal,cutoff_mask);
645 /* Calculate temporary vectorial force */
646 tx = _mm_mul_pd(fscal,dx20);
647 ty = _mm_mul_pd(fscal,dy20);
648 tz = _mm_mul_pd(fscal,dz20);
650 /* Update vectorial force */
651 fix2 = _mm_add_pd(fix2,tx);
652 fiy2 = _mm_add_pd(fiy2,ty);
653 fiz2 = _mm_add_pd(fiz2,tz);
655 fjx0 = _mm_add_pd(fjx0,tx);
656 fjy0 = _mm_add_pd(fjy0,ty);
657 fjz0 = _mm_add_pd(fjz0,tz);
661 /**************************
662 * CALCULATE INTERACTIONS *
663 **************************/
665 if (gmx_mm_any_lt(rsq21,rcutoff2))
668 r21 = _mm_mul_pd(rsq21,rinv21);
670 /* EWALD ELECTROSTATICS */
672 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
673 ewrt = _mm_mul_pd(r21,ewtabscale);
674 ewitab = _mm_cvttpd_epi32(ewrt);
675 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
676 ewitab = _mm_slli_epi32(ewitab,2);
677 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
678 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
679 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
680 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
681 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
682 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
683 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
684 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
685 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
686 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
688 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velec = _mm_and_pd(velec,cutoff_mask);
692 velecsum = _mm_add_pd(velecsum,velec);
696 fscal = _mm_and_pd(fscal,cutoff_mask);
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_pd(fscal,dx21);
700 ty = _mm_mul_pd(fscal,dy21);
701 tz = _mm_mul_pd(fscal,dz21);
703 /* Update vectorial force */
704 fix2 = _mm_add_pd(fix2,tx);
705 fiy2 = _mm_add_pd(fiy2,ty);
706 fiz2 = _mm_add_pd(fiz2,tz);
708 fjx1 = _mm_add_pd(fjx1,tx);
709 fjy1 = _mm_add_pd(fjy1,ty);
710 fjz1 = _mm_add_pd(fjz1,tz);
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 if (gmx_mm_any_lt(rsq22,rcutoff2))
721 r22 = _mm_mul_pd(rsq22,rinv22);
723 /* EWALD ELECTROSTATICS */
725 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
726 ewrt = _mm_mul_pd(r22,ewtabscale);
727 ewitab = _mm_cvttpd_epi32(ewrt);
728 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
729 ewitab = _mm_slli_epi32(ewitab,2);
730 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
731 ewtabD = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,1) );
732 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
733 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
734 ewtabFn = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,1) +2);
735 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
736 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
737 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
738 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
739 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
741 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
743 /* Update potential sum for this i atom from the interaction with this j atom. */
744 velec = _mm_and_pd(velec,cutoff_mask);
745 velecsum = _mm_add_pd(velecsum,velec);
749 fscal = _mm_and_pd(fscal,cutoff_mask);
751 /* Calculate temporary vectorial force */
752 tx = _mm_mul_pd(fscal,dx22);
753 ty = _mm_mul_pd(fscal,dy22);
754 tz = _mm_mul_pd(fscal,dz22);
756 /* Update vectorial force */
757 fix2 = _mm_add_pd(fix2,tx);
758 fiy2 = _mm_add_pd(fiy2,ty);
759 fiz2 = _mm_add_pd(fiz2,tz);
761 fjx2 = _mm_add_pd(fjx2,tx);
762 fjy2 = _mm_add_pd(fjy2,ty);
763 fjz2 = _mm_add_pd(fjz2,tz);
767 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
769 /* Inner loop uses 432 flops */
776 j_coord_offsetA = DIM*jnrA;
778 /* load j atom coordinates */
779 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
780 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
782 /* Calculate displacement vector */
783 dx00 = _mm_sub_pd(ix0,jx0);
784 dy00 = _mm_sub_pd(iy0,jy0);
785 dz00 = _mm_sub_pd(iz0,jz0);
786 dx01 = _mm_sub_pd(ix0,jx1);
787 dy01 = _mm_sub_pd(iy0,jy1);
788 dz01 = _mm_sub_pd(iz0,jz1);
789 dx02 = _mm_sub_pd(ix0,jx2);
790 dy02 = _mm_sub_pd(iy0,jy2);
791 dz02 = _mm_sub_pd(iz0,jz2);
792 dx10 = _mm_sub_pd(ix1,jx0);
793 dy10 = _mm_sub_pd(iy1,jy0);
794 dz10 = _mm_sub_pd(iz1,jz0);
795 dx11 = _mm_sub_pd(ix1,jx1);
796 dy11 = _mm_sub_pd(iy1,jy1);
797 dz11 = _mm_sub_pd(iz1,jz1);
798 dx12 = _mm_sub_pd(ix1,jx2);
799 dy12 = _mm_sub_pd(iy1,jy2);
800 dz12 = _mm_sub_pd(iz1,jz2);
801 dx20 = _mm_sub_pd(ix2,jx0);
802 dy20 = _mm_sub_pd(iy2,jy0);
803 dz20 = _mm_sub_pd(iz2,jz0);
804 dx21 = _mm_sub_pd(ix2,jx1);
805 dy21 = _mm_sub_pd(iy2,jy1);
806 dz21 = _mm_sub_pd(iz2,jz1);
807 dx22 = _mm_sub_pd(ix2,jx2);
808 dy22 = _mm_sub_pd(iy2,jy2);
809 dz22 = _mm_sub_pd(iz2,jz2);
811 /* Calculate squared distance and things based on it */
812 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
813 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
814 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
815 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
816 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
817 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
818 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
819 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
820 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
822 rinv00 = gmx_mm_invsqrt_pd(rsq00);
823 rinv01 = gmx_mm_invsqrt_pd(rsq01);
824 rinv02 = gmx_mm_invsqrt_pd(rsq02);
825 rinv10 = gmx_mm_invsqrt_pd(rsq10);
826 rinv11 = gmx_mm_invsqrt_pd(rsq11);
827 rinv12 = gmx_mm_invsqrt_pd(rsq12);
828 rinv20 = gmx_mm_invsqrt_pd(rsq20);
829 rinv21 = gmx_mm_invsqrt_pd(rsq21);
830 rinv22 = gmx_mm_invsqrt_pd(rsq22);
832 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
833 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
834 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
835 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
836 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
837 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
838 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
839 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
840 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
842 fjx0 = _mm_setzero_pd();
843 fjy0 = _mm_setzero_pd();
844 fjz0 = _mm_setzero_pd();
845 fjx1 = _mm_setzero_pd();
846 fjy1 = _mm_setzero_pd();
847 fjz1 = _mm_setzero_pd();
848 fjx2 = _mm_setzero_pd();
849 fjy2 = _mm_setzero_pd();
850 fjz2 = _mm_setzero_pd();
852 /**************************
853 * CALCULATE INTERACTIONS *
854 **************************/
856 if (gmx_mm_any_lt(rsq00,rcutoff2))
859 r00 = _mm_mul_pd(rsq00,rinv00);
861 /* EWALD ELECTROSTATICS */
863 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
864 ewrt = _mm_mul_pd(r00,ewtabscale);
865 ewitab = _mm_cvttpd_epi32(ewrt);
866 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
867 ewitab = _mm_slli_epi32(ewitab,2);
868 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
869 ewtabD = _mm_setzero_pd();
870 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
871 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
872 ewtabFn = _mm_setzero_pd();
873 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
874 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
875 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
876 velec = _mm_mul_pd(qq00,_mm_sub_pd(_mm_sub_pd(rinv00,sh_ewald),velec));
877 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
879 /* LENNARD-JONES DISPERSION/REPULSION */
881 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
882 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
883 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
884 vvdw = _mm_sub_pd(_mm_mul_pd( _mm_sub_pd(vvdw12 , _mm_mul_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
885 _mm_mul_pd( _mm_sub_pd(vvdw6,_mm_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
886 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
888 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
890 /* Update potential sum for this i atom from the interaction with this j atom. */
891 velec = _mm_and_pd(velec,cutoff_mask);
892 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
893 velecsum = _mm_add_pd(velecsum,velec);
894 vvdw = _mm_and_pd(vvdw,cutoff_mask);
895 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
896 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
898 fscal = _mm_add_pd(felec,fvdw);
900 fscal = _mm_and_pd(fscal,cutoff_mask);
902 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
904 /* Calculate temporary vectorial force */
905 tx = _mm_mul_pd(fscal,dx00);
906 ty = _mm_mul_pd(fscal,dy00);
907 tz = _mm_mul_pd(fscal,dz00);
909 /* Update vectorial force */
910 fix0 = _mm_add_pd(fix0,tx);
911 fiy0 = _mm_add_pd(fiy0,ty);
912 fiz0 = _mm_add_pd(fiz0,tz);
914 fjx0 = _mm_add_pd(fjx0,tx);
915 fjy0 = _mm_add_pd(fjy0,ty);
916 fjz0 = _mm_add_pd(fjz0,tz);
920 /**************************
921 * CALCULATE INTERACTIONS *
922 **************************/
924 if (gmx_mm_any_lt(rsq01,rcutoff2))
927 r01 = _mm_mul_pd(rsq01,rinv01);
929 /* EWALD ELECTROSTATICS */
931 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
932 ewrt = _mm_mul_pd(r01,ewtabscale);
933 ewitab = _mm_cvttpd_epi32(ewrt);
934 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
935 ewitab = _mm_slli_epi32(ewitab,2);
936 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
937 ewtabD = _mm_setzero_pd();
938 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
939 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
940 ewtabFn = _mm_setzero_pd();
941 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
942 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
943 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
944 velec = _mm_mul_pd(qq01,_mm_sub_pd(_mm_sub_pd(rinv01,sh_ewald),velec));
945 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
947 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm_and_pd(velec,cutoff_mask);
951 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
952 velecsum = _mm_add_pd(velecsum,velec);
956 fscal = _mm_and_pd(fscal,cutoff_mask);
958 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
960 /* Calculate temporary vectorial force */
961 tx = _mm_mul_pd(fscal,dx01);
962 ty = _mm_mul_pd(fscal,dy01);
963 tz = _mm_mul_pd(fscal,dz01);
965 /* Update vectorial force */
966 fix0 = _mm_add_pd(fix0,tx);
967 fiy0 = _mm_add_pd(fiy0,ty);
968 fiz0 = _mm_add_pd(fiz0,tz);
970 fjx1 = _mm_add_pd(fjx1,tx);
971 fjy1 = _mm_add_pd(fjy1,ty);
972 fjz1 = _mm_add_pd(fjz1,tz);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 if (gmx_mm_any_lt(rsq02,rcutoff2))
983 r02 = _mm_mul_pd(rsq02,rinv02);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt = _mm_mul_pd(r02,ewtabscale);
989 ewitab = _mm_cvttpd_epi32(ewrt);
990 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
991 ewitab = _mm_slli_epi32(ewitab,2);
992 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
993 ewtabD = _mm_setzero_pd();
994 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
995 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
996 ewtabFn = _mm_setzero_pd();
997 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
998 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
999 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1000 velec = _mm_mul_pd(qq02,_mm_sub_pd(_mm_sub_pd(rinv02,sh_ewald),velec));
1001 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1003 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1005 /* Update potential sum for this i atom from the interaction with this j atom. */
1006 velec = _mm_and_pd(velec,cutoff_mask);
1007 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1008 velecsum = _mm_add_pd(velecsum,velec);
1012 fscal = _mm_and_pd(fscal,cutoff_mask);
1014 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1016 /* Calculate temporary vectorial force */
1017 tx = _mm_mul_pd(fscal,dx02);
1018 ty = _mm_mul_pd(fscal,dy02);
1019 tz = _mm_mul_pd(fscal,dz02);
1021 /* Update vectorial force */
1022 fix0 = _mm_add_pd(fix0,tx);
1023 fiy0 = _mm_add_pd(fiy0,ty);
1024 fiz0 = _mm_add_pd(fiz0,tz);
1026 fjx2 = _mm_add_pd(fjx2,tx);
1027 fjy2 = _mm_add_pd(fjy2,ty);
1028 fjz2 = _mm_add_pd(fjz2,tz);
1032 /**************************
1033 * CALCULATE INTERACTIONS *
1034 **************************/
1036 if (gmx_mm_any_lt(rsq10,rcutoff2))
1039 r10 = _mm_mul_pd(rsq10,rinv10);
1041 /* EWALD ELECTROSTATICS */
1043 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1044 ewrt = _mm_mul_pd(r10,ewtabscale);
1045 ewitab = _mm_cvttpd_epi32(ewrt);
1046 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1047 ewitab = _mm_slli_epi32(ewitab,2);
1048 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1049 ewtabD = _mm_setzero_pd();
1050 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1051 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1052 ewtabFn = _mm_setzero_pd();
1053 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1054 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1055 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1056 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
1057 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1059 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1061 /* Update potential sum for this i atom from the interaction with this j atom. */
1062 velec = _mm_and_pd(velec,cutoff_mask);
1063 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1064 velecsum = _mm_add_pd(velecsum,velec);
1068 fscal = _mm_and_pd(fscal,cutoff_mask);
1070 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1072 /* Calculate temporary vectorial force */
1073 tx = _mm_mul_pd(fscal,dx10);
1074 ty = _mm_mul_pd(fscal,dy10);
1075 tz = _mm_mul_pd(fscal,dz10);
1077 /* Update vectorial force */
1078 fix1 = _mm_add_pd(fix1,tx);
1079 fiy1 = _mm_add_pd(fiy1,ty);
1080 fiz1 = _mm_add_pd(fiz1,tz);
1082 fjx0 = _mm_add_pd(fjx0,tx);
1083 fjy0 = _mm_add_pd(fjy0,ty);
1084 fjz0 = _mm_add_pd(fjz0,tz);
1088 /**************************
1089 * CALCULATE INTERACTIONS *
1090 **************************/
1092 if (gmx_mm_any_lt(rsq11,rcutoff2))
1095 r11 = _mm_mul_pd(rsq11,rinv11);
1097 /* EWALD ELECTROSTATICS */
1099 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1100 ewrt = _mm_mul_pd(r11,ewtabscale);
1101 ewitab = _mm_cvttpd_epi32(ewrt);
1102 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1103 ewitab = _mm_slli_epi32(ewitab,2);
1104 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1105 ewtabD = _mm_setzero_pd();
1106 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1107 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1108 ewtabFn = _mm_setzero_pd();
1109 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1110 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1111 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1112 velec = _mm_mul_pd(qq11,_mm_sub_pd(_mm_sub_pd(rinv11,sh_ewald),velec));
1113 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1115 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1117 /* Update potential sum for this i atom from the interaction with this j atom. */
1118 velec = _mm_and_pd(velec,cutoff_mask);
1119 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1120 velecsum = _mm_add_pd(velecsum,velec);
1124 fscal = _mm_and_pd(fscal,cutoff_mask);
1126 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1128 /* Calculate temporary vectorial force */
1129 tx = _mm_mul_pd(fscal,dx11);
1130 ty = _mm_mul_pd(fscal,dy11);
1131 tz = _mm_mul_pd(fscal,dz11);
1133 /* Update vectorial force */
1134 fix1 = _mm_add_pd(fix1,tx);
1135 fiy1 = _mm_add_pd(fiy1,ty);
1136 fiz1 = _mm_add_pd(fiz1,tz);
1138 fjx1 = _mm_add_pd(fjx1,tx);
1139 fjy1 = _mm_add_pd(fjy1,ty);
1140 fjz1 = _mm_add_pd(fjz1,tz);
1144 /**************************
1145 * CALCULATE INTERACTIONS *
1146 **************************/
1148 if (gmx_mm_any_lt(rsq12,rcutoff2))
1151 r12 = _mm_mul_pd(rsq12,rinv12);
1153 /* EWALD ELECTROSTATICS */
1155 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1156 ewrt = _mm_mul_pd(r12,ewtabscale);
1157 ewitab = _mm_cvttpd_epi32(ewrt);
1158 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1159 ewitab = _mm_slli_epi32(ewitab,2);
1160 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1161 ewtabD = _mm_setzero_pd();
1162 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1163 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1164 ewtabFn = _mm_setzero_pd();
1165 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1166 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1167 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1168 velec = _mm_mul_pd(qq12,_mm_sub_pd(_mm_sub_pd(rinv12,sh_ewald),velec));
1169 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1171 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1173 /* Update potential sum for this i atom from the interaction with this j atom. */
1174 velec = _mm_and_pd(velec,cutoff_mask);
1175 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1176 velecsum = _mm_add_pd(velecsum,velec);
1180 fscal = _mm_and_pd(fscal,cutoff_mask);
1182 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1184 /* Calculate temporary vectorial force */
1185 tx = _mm_mul_pd(fscal,dx12);
1186 ty = _mm_mul_pd(fscal,dy12);
1187 tz = _mm_mul_pd(fscal,dz12);
1189 /* Update vectorial force */
1190 fix1 = _mm_add_pd(fix1,tx);
1191 fiy1 = _mm_add_pd(fiy1,ty);
1192 fiz1 = _mm_add_pd(fiz1,tz);
1194 fjx2 = _mm_add_pd(fjx2,tx);
1195 fjy2 = _mm_add_pd(fjy2,ty);
1196 fjz2 = _mm_add_pd(fjz2,tz);
1200 /**************************
1201 * CALCULATE INTERACTIONS *
1202 **************************/
1204 if (gmx_mm_any_lt(rsq20,rcutoff2))
1207 r20 = _mm_mul_pd(rsq20,rinv20);
1209 /* EWALD ELECTROSTATICS */
1211 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1212 ewrt = _mm_mul_pd(r20,ewtabscale);
1213 ewitab = _mm_cvttpd_epi32(ewrt);
1214 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1215 ewitab = _mm_slli_epi32(ewitab,2);
1216 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1217 ewtabD = _mm_setzero_pd();
1218 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1219 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1220 ewtabFn = _mm_setzero_pd();
1221 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1222 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1223 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1224 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
1225 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1227 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1229 /* Update potential sum for this i atom from the interaction with this j atom. */
1230 velec = _mm_and_pd(velec,cutoff_mask);
1231 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1232 velecsum = _mm_add_pd(velecsum,velec);
1236 fscal = _mm_and_pd(fscal,cutoff_mask);
1238 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1240 /* Calculate temporary vectorial force */
1241 tx = _mm_mul_pd(fscal,dx20);
1242 ty = _mm_mul_pd(fscal,dy20);
1243 tz = _mm_mul_pd(fscal,dz20);
1245 /* Update vectorial force */
1246 fix2 = _mm_add_pd(fix2,tx);
1247 fiy2 = _mm_add_pd(fiy2,ty);
1248 fiz2 = _mm_add_pd(fiz2,tz);
1250 fjx0 = _mm_add_pd(fjx0,tx);
1251 fjy0 = _mm_add_pd(fjy0,ty);
1252 fjz0 = _mm_add_pd(fjz0,tz);
1256 /**************************
1257 * CALCULATE INTERACTIONS *
1258 **************************/
1260 if (gmx_mm_any_lt(rsq21,rcutoff2))
1263 r21 = _mm_mul_pd(rsq21,rinv21);
1265 /* EWALD ELECTROSTATICS */
1267 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268 ewrt = _mm_mul_pd(r21,ewtabscale);
1269 ewitab = _mm_cvttpd_epi32(ewrt);
1270 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1271 ewitab = _mm_slli_epi32(ewitab,2);
1272 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1273 ewtabD = _mm_setzero_pd();
1274 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1275 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1276 ewtabFn = _mm_setzero_pd();
1277 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1278 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1279 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1280 velec = _mm_mul_pd(qq21,_mm_sub_pd(_mm_sub_pd(rinv21,sh_ewald),velec));
1281 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1283 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1285 /* Update potential sum for this i atom from the interaction with this j atom. */
1286 velec = _mm_and_pd(velec,cutoff_mask);
1287 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1288 velecsum = _mm_add_pd(velecsum,velec);
1292 fscal = _mm_and_pd(fscal,cutoff_mask);
1294 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1296 /* Calculate temporary vectorial force */
1297 tx = _mm_mul_pd(fscal,dx21);
1298 ty = _mm_mul_pd(fscal,dy21);
1299 tz = _mm_mul_pd(fscal,dz21);
1301 /* Update vectorial force */
1302 fix2 = _mm_add_pd(fix2,tx);
1303 fiy2 = _mm_add_pd(fiy2,ty);
1304 fiz2 = _mm_add_pd(fiz2,tz);
1306 fjx1 = _mm_add_pd(fjx1,tx);
1307 fjy1 = _mm_add_pd(fjy1,ty);
1308 fjz1 = _mm_add_pd(fjz1,tz);
1312 /**************************
1313 * CALCULATE INTERACTIONS *
1314 **************************/
1316 if (gmx_mm_any_lt(rsq22,rcutoff2))
1319 r22 = _mm_mul_pd(rsq22,rinv22);
1321 /* EWALD ELECTROSTATICS */
1323 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1324 ewrt = _mm_mul_pd(r22,ewtabscale);
1325 ewitab = _mm_cvttpd_epi32(ewrt);
1326 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1327 ewitab = _mm_slli_epi32(ewitab,2);
1328 ewtabF = _mm_load_pd( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1329 ewtabD = _mm_setzero_pd();
1330 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
1331 ewtabV = _mm_load_sd( ewtab + gmx_mm_extract_epi32(ewitab,0) +2);
1332 ewtabFn = _mm_setzero_pd();
1333 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
1334 felec = _mm_add_pd(ewtabF,_mm_mul_pd(eweps,ewtabD));
1335 velec = _mm_sub_pd(ewtabV,_mm_mul_pd(_mm_mul_pd(ewtabhalfspace,eweps),_mm_add_pd(ewtabF,felec)));
1336 velec = _mm_mul_pd(qq22,_mm_sub_pd(_mm_sub_pd(rinv22,sh_ewald),velec));
1337 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1339 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
1341 /* Update potential sum for this i atom from the interaction with this j atom. */
1342 velec = _mm_and_pd(velec,cutoff_mask);
1343 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1344 velecsum = _mm_add_pd(velecsum,velec);
1348 fscal = _mm_and_pd(fscal,cutoff_mask);
1350 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1352 /* Calculate temporary vectorial force */
1353 tx = _mm_mul_pd(fscal,dx22);
1354 ty = _mm_mul_pd(fscal,dy22);
1355 tz = _mm_mul_pd(fscal,dz22);
1357 /* Update vectorial force */
1358 fix2 = _mm_add_pd(fix2,tx);
1359 fiy2 = _mm_add_pd(fiy2,ty);
1360 fiz2 = _mm_add_pd(fiz2,tz);
1362 fjx2 = _mm_add_pd(fjx2,tx);
1363 fjy2 = _mm_add_pd(fjy2,ty);
1364 fjz2 = _mm_add_pd(fjz2,tz);
1368 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1370 /* Inner loop uses 432 flops */
1373 /* End of innermost loop */
1375 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1376 f+i_coord_offset,fshift+i_shift_offset);
1379 /* Update potential energies */
1380 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1381 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1383 /* Increment number of inner iterations */
1384 inneriter += j_index_end - j_index_start;
1386 /* Outer loop uses 20 flops */
1389 /* Increment number of outer iterations */
1392 /* Update outer/inner flops */
1394 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*432);
1397 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double
1398 * Electrostatics interaction: Ewald
1399 * VdW interaction: LennardJones
1400 * Geometry: Water3-Water3
1401 * Calculate force/pot: Force
1404 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse4_1_double
1405 (t_nblist * gmx_restrict nlist,
1406 rvec * gmx_restrict xx,
1407 rvec * gmx_restrict ff,
1408 t_forcerec * gmx_restrict fr,
1409 t_mdatoms * gmx_restrict mdatoms,
1410 nb_kernel_data_t * gmx_restrict kernel_data,
1411 t_nrnb * gmx_restrict nrnb)
1413 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1414 * just 0 for non-waters.
1415 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1416 * jnr indices corresponding to data put in the four positions in the SIMD register.
1418 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1419 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1421 int j_coord_offsetA,j_coord_offsetB;
1422 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1423 real rcutoff_scalar;
1424 real *shiftvec,*fshift,*x,*f;
1425 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1427 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1429 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1431 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1432 int vdwjidx0A,vdwjidx0B;
1433 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1434 int vdwjidx1A,vdwjidx1B;
1435 __m128d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1436 int vdwjidx2A,vdwjidx2B;
1437 __m128d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1438 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1439 __m128d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1440 __m128d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1441 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1442 __m128d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1443 __m128d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1444 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1445 __m128d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1446 __m128d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1447 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
1450 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1453 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
1454 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
1456 __m128d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1458 __m128d dummy_mask,cutoff_mask;
1459 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1460 __m128d one = _mm_set1_pd(1.0);
1461 __m128d two = _mm_set1_pd(2.0);
1467 jindex = nlist->jindex;
1469 shiftidx = nlist->shift;
1471 shiftvec = fr->shift_vec[0];
1472 fshift = fr->fshift[0];
1473 facel = _mm_set1_pd(fr->epsfac);
1474 charge = mdatoms->chargeA;
1475 nvdwtype = fr->ntype;
1476 vdwparam = fr->nbfp;
1477 vdwtype = mdatoms->typeA;
1479 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
1480 ewtab = fr->ic->tabq_coul_F;
1481 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
1482 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
1484 /* Setup water-specific parameters */
1485 inr = nlist->iinr[0];
1486 iq0 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1487 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1488 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1489 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1491 jq0 = _mm_set1_pd(charge[inr+0]);
1492 jq1 = _mm_set1_pd(charge[inr+1]);
1493 jq2 = _mm_set1_pd(charge[inr+2]);
1494 vdwjidx0A = 2*vdwtype[inr+0];
1495 qq00 = _mm_mul_pd(iq0,jq0);
1496 c6_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1497 c12_00 = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1498 qq01 = _mm_mul_pd(iq0,jq1);
1499 qq02 = _mm_mul_pd(iq0,jq2);
1500 qq10 = _mm_mul_pd(iq1,jq0);
1501 qq11 = _mm_mul_pd(iq1,jq1);
1502 qq12 = _mm_mul_pd(iq1,jq2);
1503 qq20 = _mm_mul_pd(iq2,jq0);
1504 qq21 = _mm_mul_pd(iq2,jq1);
1505 qq22 = _mm_mul_pd(iq2,jq2);
1507 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1508 rcutoff_scalar = fr->rcoulomb;
1509 rcutoff = _mm_set1_pd(rcutoff_scalar);
1510 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
1512 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
1513 rvdw = _mm_set1_pd(fr->rvdw);
1515 /* Avoid stupid compiler warnings */
1517 j_coord_offsetA = 0;
1518 j_coord_offsetB = 0;
1523 /* Start outer loop over neighborlists */
1524 for(iidx=0; iidx<nri; iidx++)
1526 /* Load shift vector for this list */
1527 i_shift_offset = DIM*shiftidx[iidx];
1529 /* Load limits for loop over neighbors */
1530 j_index_start = jindex[iidx];
1531 j_index_end = jindex[iidx+1];
1533 /* Get outer coordinate index */
1535 i_coord_offset = DIM*inr;
1537 /* Load i particle coords and add shift vector */
1538 gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1539 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1541 fix0 = _mm_setzero_pd();
1542 fiy0 = _mm_setzero_pd();
1543 fiz0 = _mm_setzero_pd();
1544 fix1 = _mm_setzero_pd();
1545 fiy1 = _mm_setzero_pd();
1546 fiz1 = _mm_setzero_pd();
1547 fix2 = _mm_setzero_pd();
1548 fiy2 = _mm_setzero_pd();
1549 fiz2 = _mm_setzero_pd();
1551 /* Start inner kernel loop */
1552 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1555 /* Get j neighbor index, and coordinate index */
1557 jnrB = jjnr[jidx+1];
1558 j_coord_offsetA = DIM*jnrA;
1559 j_coord_offsetB = DIM*jnrB;
1561 /* load j atom coordinates */
1562 gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1563 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1565 /* Calculate displacement vector */
1566 dx00 = _mm_sub_pd(ix0,jx0);
1567 dy00 = _mm_sub_pd(iy0,jy0);
1568 dz00 = _mm_sub_pd(iz0,jz0);
1569 dx01 = _mm_sub_pd(ix0,jx1);
1570 dy01 = _mm_sub_pd(iy0,jy1);
1571 dz01 = _mm_sub_pd(iz0,jz1);
1572 dx02 = _mm_sub_pd(ix0,jx2);
1573 dy02 = _mm_sub_pd(iy0,jy2);
1574 dz02 = _mm_sub_pd(iz0,jz2);
1575 dx10 = _mm_sub_pd(ix1,jx0);
1576 dy10 = _mm_sub_pd(iy1,jy0);
1577 dz10 = _mm_sub_pd(iz1,jz0);
1578 dx11 = _mm_sub_pd(ix1,jx1);
1579 dy11 = _mm_sub_pd(iy1,jy1);
1580 dz11 = _mm_sub_pd(iz1,jz1);
1581 dx12 = _mm_sub_pd(ix1,jx2);
1582 dy12 = _mm_sub_pd(iy1,jy2);
1583 dz12 = _mm_sub_pd(iz1,jz2);
1584 dx20 = _mm_sub_pd(ix2,jx0);
1585 dy20 = _mm_sub_pd(iy2,jy0);
1586 dz20 = _mm_sub_pd(iz2,jz0);
1587 dx21 = _mm_sub_pd(ix2,jx1);
1588 dy21 = _mm_sub_pd(iy2,jy1);
1589 dz21 = _mm_sub_pd(iz2,jz1);
1590 dx22 = _mm_sub_pd(ix2,jx2);
1591 dy22 = _mm_sub_pd(iy2,jy2);
1592 dz22 = _mm_sub_pd(iz2,jz2);
1594 /* Calculate squared distance and things based on it */
1595 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1596 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1597 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1598 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1599 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1600 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1601 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1602 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1603 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1605 rinv00 = gmx_mm_invsqrt_pd(rsq00);
1606 rinv01 = gmx_mm_invsqrt_pd(rsq01);
1607 rinv02 = gmx_mm_invsqrt_pd(rsq02);
1608 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1609 rinv11 = gmx_mm_invsqrt_pd(rsq11);
1610 rinv12 = gmx_mm_invsqrt_pd(rsq12);
1611 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1612 rinv21 = gmx_mm_invsqrt_pd(rsq21);
1613 rinv22 = gmx_mm_invsqrt_pd(rsq22);
1615 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
1616 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
1617 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
1618 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1619 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
1620 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
1621 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1622 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
1623 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
1625 fjx0 = _mm_setzero_pd();
1626 fjy0 = _mm_setzero_pd();
1627 fjz0 = _mm_setzero_pd();
1628 fjx1 = _mm_setzero_pd();
1629 fjy1 = _mm_setzero_pd();
1630 fjz1 = _mm_setzero_pd();
1631 fjx2 = _mm_setzero_pd();
1632 fjy2 = _mm_setzero_pd();
1633 fjz2 = _mm_setzero_pd();
1635 /**************************
1636 * CALCULATE INTERACTIONS *
1637 **************************/
1639 if (gmx_mm_any_lt(rsq00,rcutoff2))
1642 r00 = _mm_mul_pd(rsq00,rinv00);
1644 /* EWALD ELECTROSTATICS */
1646 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1647 ewrt = _mm_mul_pd(r00,ewtabscale);
1648 ewitab = _mm_cvttpd_epi32(ewrt);
1649 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1650 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1652 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1653 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
1655 /* LENNARD-JONES DISPERSION/REPULSION */
1657 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1658 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1660 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1662 fscal = _mm_add_pd(felec,fvdw);
1664 fscal = _mm_and_pd(fscal,cutoff_mask);
1666 /* Calculate temporary vectorial force */
1667 tx = _mm_mul_pd(fscal,dx00);
1668 ty = _mm_mul_pd(fscal,dy00);
1669 tz = _mm_mul_pd(fscal,dz00);
1671 /* Update vectorial force */
1672 fix0 = _mm_add_pd(fix0,tx);
1673 fiy0 = _mm_add_pd(fiy0,ty);
1674 fiz0 = _mm_add_pd(fiz0,tz);
1676 fjx0 = _mm_add_pd(fjx0,tx);
1677 fjy0 = _mm_add_pd(fjy0,ty);
1678 fjz0 = _mm_add_pd(fjz0,tz);
1682 /**************************
1683 * CALCULATE INTERACTIONS *
1684 **************************/
1686 if (gmx_mm_any_lt(rsq01,rcutoff2))
1689 r01 = _mm_mul_pd(rsq01,rinv01);
1691 /* EWALD ELECTROSTATICS */
1693 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1694 ewrt = _mm_mul_pd(r01,ewtabscale);
1695 ewitab = _mm_cvttpd_epi32(ewrt);
1696 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1697 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1699 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1700 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
1702 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
1706 fscal = _mm_and_pd(fscal,cutoff_mask);
1708 /* Calculate temporary vectorial force */
1709 tx = _mm_mul_pd(fscal,dx01);
1710 ty = _mm_mul_pd(fscal,dy01);
1711 tz = _mm_mul_pd(fscal,dz01);
1713 /* Update vectorial force */
1714 fix0 = _mm_add_pd(fix0,tx);
1715 fiy0 = _mm_add_pd(fiy0,ty);
1716 fiz0 = _mm_add_pd(fiz0,tz);
1718 fjx1 = _mm_add_pd(fjx1,tx);
1719 fjy1 = _mm_add_pd(fjy1,ty);
1720 fjz1 = _mm_add_pd(fjz1,tz);
1724 /**************************
1725 * CALCULATE INTERACTIONS *
1726 **************************/
1728 if (gmx_mm_any_lt(rsq02,rcutoff2))
1731 r02 = _mm_mul_pd(rsq02,rinv02);
1733 /* EWALD ELECTROSTATICS */
1735 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1736 ewrt = _mm_mul_pd(r02,ewtabscale);
1737 ewitab = _mm_cvttpd_epi32(ewrt);
1738 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1739 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1741 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1742 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
1744 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
1748 fscal = _mm_and_pd(fscal,cutoff_mask);
1750 /* Calculate temporary vectorial force */
1751 tx = _mm_mul_pd(fscal,dx02);
1752 ty = _mm_mul_pd(fscal,dy02);
1753 tz = _mm_mul_pd(fscal,dz02);
1755 /* Update vectorial force */
1756 fix0 = _mm_add_pd(fix0,tx);
1757 fiy0 = _mm_add_pd(fiy0,ty);
1758 fiz0 = _mm_add_pd(fiz0,tz);
1760 fjx2 = _mm_add_pd(fjx2,tx);
1761 fjy2 = _mm_add_pd(fjy2,ty);
1762 fjz2 = _mm_add_pd(fjz2,tz);
1766 /**************************
1767 * CALCULATE INTERACTIONS *
1768 **************************/
1770 if (gmx_mm_any_lt(rsq10,rcutoff2))
1773 r10 = _mm_mul_pd(rsq10,rinv10);
1775 /* EWALD ELECTROSTATICS */
1777 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1778 ewrt = _mm_mul_pd(r10,ewtabscale);
1779 ewitab = _mm_cvttpd_epi32(ewrt);
1780 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1781 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1783 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1784 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1786 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1790 fscal = _mm_and_pd(fscal,cutoff_mask);
1792 /* Calculate temporary vectorial force */
1793 tx = _mm_mul_pd(fscal,dx10);
1794 ty = _mm_mul_pd(fscal,dy10);
1795 tz = _mm_mul_pd(fscal,dz10);
1797 /* Update vectorial force */
1798 fix1 = _mm_add_pd(fix1,tx);
1799 fiy1 = _mm_add_pd(fiy1,ty);
1800 fiz1 = _mm_add_pd(fiz1,tz);
1802 fjx0 = _mm_add_pd(fjx0,tx);
1803 fjy0 = _mm_add_pd(fjy0,ty);
1804 fjz0 = _mm_add_pd(fjz0,tz);
1808 /**************************
1809 * CALCULATE INTERACTIONS *
1810 **************************/
1812 if (gmx_mm_any_lt(rsq11,rcutoff2))
1815 r11 = _mm_mul_pd(rsq11,rinv11);
1817 /* EWALD ELECTROSTATICS */
1819 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1820 ewrt = _mm_mul_pd(r11,ewtabscale);
1821 ewitab = _mm_cvttpd_epi32(ewrt);
1822 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1823 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1825 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1826 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
1828 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
1832 fscal = _mm_and_pd(fscal,cutoff_mask);
1834 /* Calculate temporary vectorial force */
1835 tx = _mm_mul_pd(fscal,dx11);
1836 ty = _mm_mul_pd(fscal,dy11);
1837 tz = _mm_mul_pd(fscal,dz11);
1839 /* Update vectorial force */
1840 fix1 = _mm_add_pd(fix1,tx);
1841 fiy1 = _mm_add_pd(fiy1,ty);
1842 fiz1 = _mm_add_pd(fiz1,tz);
1844 fjx1 = _mm_add_pd(fjx1,tx);
1845 fjy1 = _mm_add_pd(fjy1,ty);
1846 fjz1 = _mm_add_pd(fjz1,tz);
1850 /**************************
1851 * CALCULATE INTERACTIONS *
1852 **************************/
1854 if (gmx_mm_any_lt(rsq12,rcutoff2))
1857 r12 = _mm_mul_pd(rsq12,rinv12);
1859 /* EWALD ELECTROSTATICS */
1861 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1862 ewrt = _mm_mul_pd(r12,ewtabscale);
1863 ewitab = _mm_cvttpd_epi32(ewrt);
1864 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1865 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1867 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1868 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
1870 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
1874 fscal = _mm_and_pd(fscal,cutoff_mask);
1876 /* Calculate temporary vectorial force */
1877 tx = _mm_mul_pd(fscal,dx12);
1878 ty = _mm_mul_pd(fscal,dy12);
1879 tz = _mm_mul_pd(fscal,dz12);
1881 /* Update vectorial force */
1882 fix1 = _mm_add_pd(fix1,tx);
1883 fiy1 = _mm_add_pd(fiy1,ty);
1884 fiz1 = _mm_add_pd(fiz1,tz);
1886 fjx2 = _mm_add_pd(fjx2,tx);
1887 fjy2 = _mm_add_pd(fjy2,ty);
1888 fjz2 = _mm_add_pd(fjz2,tz);
1892 /**************************
1893 * CALCULATE INTERACTIONS *
1894 **************************/
1896 if (gmx_mm_any_lt(rsq20,rcutoff2))
1899 r20 = _mm_mul_pd(rsq20,rinv20);
1901 /* EWALD ELECTROSTATICS */
1903 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1904 ewrt = _mm_mul_pd(r20,ewtabscale);
1905 ewitab = _mm_cvttpd_epi32(ewrt);
1906 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1907 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1909 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1910 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1912 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1916 fscal = _mm_and_pd(fscal,cutoff_mask);
1918 /* Calculate temporary vectorial force */
1919 tx = _mm_mul_pd(fscal,dx20);
1920 ty = _mm_mul_pd(fscal,dy20);
1921 tz = _mm_mul_pd(fscal,dz20);
1923 /* Update vectorial force */
1924 fix2 = _mm_add_pd(fix2,tx);
1925 fiy2 = _mm_add_pd(fiy2,ty);
1926 fiz2 = _mm_add_pd(fiz2,tz);
1928 fjx0 = _mm_add_pd(fjx0,tx);
1929 fjy0 = _mm_add_pd(fjy0,ty);
1930 fjz0 = _mm_add_pd(fjz0,tz);
1934 /**************************
1935 * CALCULATE INTERACTIONS *
1936 **************************/
1938 if (gmx_mm_any_lt(rsq21,rcutoff2))
1941 r21 = _mm_mul_pd(rsq21,rinv21);
1943 /* EWALD ELECTROSTATICS */
1945 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1946 ewrt = _mm_mul_pd(r21,ewtabscale);
1947 ewitab = _mm_cvttpd_epi32(ewrt);
1948 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1949 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1951 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1952 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
1954 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
1958 fscal = _mm_and_pd(fscal,cutoff_mask);
1960 /* Calculate temporary vectorial force */
1961 tx = _mm_mul_pd(fscal,dx21);
1962 ty = _mm_mul_pd(fscal,dy21);
1963 tz = _mm_mul_pd(fscal,dz21);
1965 /* Update vectorial force */
1966 fix2 = _mm_add_pd(fix2,tx);
1967 fiy2 = _mm_add_pd(fiy2,ty);
1968 fiz2 = _mm_add_pd(fiz2,tz);
1970 fjx1 = _mm_add_pd(fjx1,tx);
1971 fjy1 = _mm_add_pd(fjy1,ty);
1972 fjz1 = _mm_add_pd(fjz1,tz);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 if (gmx_mm_any_lt(rsq22,rcutoff2))
1983 r22 = _mm_mul_pd(rsq22,rinv22);
1985 /* EWALD ELECTROSTATICS */
1987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988 ewrt = _mm_mul_pd(r22,ewtabscale);
1989 ewitab = _mm_cvttpd_epi32(ewrt);
1990 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1991 gmx_mm_load_2pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1993 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
1994 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
1996 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2000 fscal = _mm_and_pd(fscal,cutoff_mask);
2002 /* Calculate temporary vectorial force */
2003 tx = _mm_mul_pd(fscal,dx22);
2004 ty = _mm_mul_pd(fscal,dy22);
2005 tz = _mm_mul_pd(fscal,dz22);
2007 /* Update vectorial force */
2008 fix2 = _mm_add_pd(fix2,tx);
2009 fiy2 = _mm_add_pd(fiy2,ty);
2010 fiz2 = _mm_add_pd(fiz2,tz);
2012 fjx2 = _mm_add_pd(fjx2,tx);
2013 fjy2 = _mm_add_pd(fjy2,ty);
2014 fjz2 = _mm_add_pd(fjz2,tz);
2018 gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2020 /* Inner loop uses 358 flops */
2023 if(jidx<j_index_end)
2027 j_coord_offsetA = DIM*jnrA;
2029 /* load j atom coordinates */
2030 gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2031 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2033 /* Calculate displacement vector */
2034 dx00 = _mm_sub_pd(ix0,jx0);
2035 dy00 = _mm_sub_pd(iy0,jy0);
2036 dz00 = _mm_sub_pd(iz0,jz0);
2037 dx01 = _mm_sub_pd(ix0,jx1);
2038 dy01 = _mm_sub_pd(iy0,jy1);
2039 dz01 = _mm_sub_pd(iz0,jz1);
2040 dx02 = _mm_sub_pd(ix0,jx2);
2041 dy02 = _mm_sub_pd(iy0,jy2);
2042 dz02 = _mm_sub_pd(iz0,jz2);
2043 dx10 = _mm_sub_pd(ix1,jx0);
2044 dy10 = _mm_sub_pd(iy1,jy0);
2045 dz10 = _mm_sub_pd(iz1,jz0);
2046 dx11 = _mm_sub_pd(ix1,jx1);
2047 dy11 = _mm_sub_pd(iy1,jy1);
2048 dz11 = _mm_sub_pd(iz1,jz1);
2049 dx12 = _mm_sub_pd(ix1,jx2);
2050 dy12 = _mm_sub_pd(iy1,jy2);
2051 dz12 = _mm_sub_pd(iz1,jz2);
2052 dx20 = _mm_sub_pd(ix2,jx0);
2053 dy20 = _mm_sub_pd(iy2,jy0);
2054 dz20 = _mm_sub_pd(iz2,jz0);
2055 dx21 = _mm_sub_pd(ix2,jx1);
2056 dy21 = _mm_sub_pd(iy2,jy1);
2057 dz21 = _mm_sub_pd(iz2,jz1);
2058 dx22 = _mm_sub_pd(ix2,jx2);
2059 dy22 = _mm_sub_pd(iy2,jy2);
2060 dz22 = _mm_sub_pd(iz2,jz2);
2062 /* Calculate squared distance and things based on it */
2063 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2064 rsq01 = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
2065 rsq02 = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
2066 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
2067 rsq11 = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2068 rsq12 = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2069 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
2070 rsq21 = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2071 rsq22 = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2073 rinv00 = gmx_mm_invsqrt_pd(rsq00);
2074 rinv01 = gmx_mm_invsqrt_pd(rsq01);
2075 rinv02 = gmx_mm_invsqrt_pd(rsq02);
2076 rinv10 = gmx_mm_invsqrt_pd(rsq10);
2077 rinv11 = gmx_mm_invsqrt_pd(rsq11);
2078 rinv12 = gmx_mm_invsqrt_pd(rsq12);
2079 rinv20 = gmx_mm_invsqrt_pd(rsq20);
2080 rinv21 = gmx_mm_invsqrt_pd(rsq21);
2081 rinv22 = gmx_mm_invsqrt_pd(rsq22);
2083 rinvsq00 = _mm_mul_pd(rinv00,rinv00);
2084 rinvsq01 = _mm_mul_pd(rinv01,rinv01);
2085 rinvsq02 = _mm_mul_pd(rinv02,rinv02);
2086 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
2087 rinvsq11 = _mm_mul_pd(rinv11,rinv11);
2088 rinvsq12 = _mm_mul_pd(rinv12,rinv12);
2089 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
2090 rinvsq21 = _mm_mul_pd(rinv21,rinv21);
2091 rinvsq22 = _mm_mul_pd(rinv22,rinv22);
2093 fjx0 = _mm_setzero_pd();
2094 fjy0 = _mm_setzero_pd();
2095 fjz0 = _mm_setzero_pd();
2096 fjx1 = _mm_setzero_pd();
2097 fjy1 = _mm_setzero_pd();
2098 fjz1 = _mm_setzero_pd();
2099 fjx2 = _mm_setzero_pd();
2100 fjy2 = _mm_setzero_pd();
2101 fjz2 = _mm_setzero_pd();
2103 /**************************
2104 * CALCULATE INTERACTIONS *
2105 **************************/
2107 if (gmx_mm_any_lt(rsq00,rcutoff2))
2110 r00 = _mm_mul_pd(rsq00,rinv00);
2112 /* EWALD ELECTROSTATICS */
2114 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2115 ewrt = _mm_mul_pd(r00,ewtabscale);
2116 ewitab = _mm_cvttpd_epi32(ewrt);
2117 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2118 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2119 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2120 felec = _mm_mul_pd(_mm_mul_pd(qq00,rinv00),_mm_sub_pd(rinvsq00,felec));
2122 /* LENNARD-JONES DISPERSION/REPULSION */
2124 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
2125 fvdw = _mm_mul_pd(_mm_sub_pd(_mm_mul_pd(c12_00,rinvsix),c6_00),_mm_mul_pd(rinvsix,rinvsq00));
2127 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
2129 fscal = _mm_add_pd(felec,fvdw);
2131 fscal = _mm_and_pd(fscal,cutoff_mask);
2133 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2135 /* Calculate temporary vectorial force */
2136 tx = _mm_mul_pd(fscal,dx00);
2137 ty = _mm_mul_pd(fscal,dy00);
2138 tz = _mm_mul_pd(fscal,dz00);
2140 /* Update vectorial force */
2141 fix0 = _mm_add_pd(fix0,tx);
2142 fiy0 = _mm_add_pd(fiy0,ty);
2143 fiz0 = _mm_add_pd(fiz0,tz);
2145 fjx0 = _mm_add_pd(fjx0,tx);
2146 fjy0 = _mm_add_pd(fjy0,ty);
2147 fjz0 = _mm_add_pd(fjz0,tz);
2151 /**************************
2152 * CALCULATE INTERACTIONS *
2153 **************************/
2155 if (gmx_mm_any_lt(rsq01,rcutoff2))
2158 r01 = _mm_mul_pd(rsq01,rinv01);
2160 /* EWALD ELECTROSTATICS */
2162 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2163 ewrt = _mm_mul_pd(r01,ewtabscale);
2164 ewitab = _mm_cvttpd_epi32(ewrt);
2165 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2166 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2167 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2168 felec = _mm_mul_pd(_mm_mul_pd(qq01,rinv01),_mm_sub_pd(rinvsq01,felec));
2170 cutoff_mask = _mm_cmplt_pd(rsq01,rcutoff2);
2174 fscal = _mm_and_pd(fscal,cutoff_mask);
2176 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2178 /* Calculate temporary vectorial force */
2179 tx = _mm_mul_pd(fscal,dx01);
2180 ty = _mm_mul_pd(fscal,dy01);
2181 tz = _mm_mul_pd(fscal,dz01);
2183 /* Update vectorial force */
2184 fix0 = _mm_add_pd(fix0,tx);
2185 fiy0 = _mm_add_pd(fiy0,ty);
2186 fiz0 = _mm_add_pd(fiz0,tz);
2188 fjx1 = _mm_add_pd(fjx1,tx);
2189 fjy1 = _mm_add_pd(fjy1,ty);
2190 fjz1 = _mm_add_pd(fjz1,tz);
2194 /**************************
2195 * CALCULATE INTERACTIONS *
2196 **************************/
2198 if (gmx_mm_any_lt(rsq02,rcutoff2))
2201 r02 = _mm_mul_pd(rsq02,rinv02);
2203 /* EWALD ELECTROSTATICS */
2205 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2206 ewrt = _mm_mul_pd(r02,ewtabscale);
2207 ewitab = _mm_cvttpd_epi32(ewrt);
2208 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2209 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2210 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2211 felec = _mm_mul_pd(_mm_mul_pd(qq02,rinv02),_mm_sub_pd(rinvsq02,felec));
2213 cutoff_mask = _mm_cmplt_pd(rsq02,rcutoff2);
2217 fscal = _mm_and_pd(fscal,cutoff_mask);
2219 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2221 /* Calculate temporary vectorial force */
2222 tx = _mm_mul_pd(fscal,dx02);
2223 ty = _mm_mul_pd(fscal,dy02);
2224 tz = _mm_mul_pd(fscal,dz02);
2226 /* Update vectorial force */
2227 fix0 = _mm_add_pd(fix0,tx);
2228 fiy0 = _mm_add_pd(fiy0,ty);
2229 fiz0 = _mm_add_pd(fiz0,tz);
2231 fjx2 = _mm_add_pd(fjx2,tx);
2232 fjy2 = _mm_add_pd(fjy2,ty);
2233 fjz2 = _mm_add_pd(fjz2,tz);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 if (gmx_mm_any_lt(rsq10,rcutoff2))
2244 r10 = _mm_mul_pd(rsq10,rinv10);
2246 /* EWALD ELECTROSTATICS */
2248 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2249 ewrt = _mm_mul_pd(r10,ewtabscale);
2250 ewitab = _mm_cvttpd_epi32(ewrt);
2251 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2252 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2253 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2254 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
2256 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
2260 fscal = _mm_and_pd(fscal,cutoff_mask);
2262 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2264 /* Calculate temporary vectorial force */
2265 tx = _mm_mul_pd(fscal,dx10);
2266 ty = _mm_mul_pd(fscal,dy10);
2267 tz = _mm_mul_pd(fscal,dz10);
2269 /* Update vectorial force */
2270 fix1 = _mm_add_pd(fix1,tx);
2271 fiy1 = _mm_add_pd(fiy1,ty);
2272 fiz1 = _mm_add_pd(fiz1,tz);
2274 fjx0 = _mm_add_pd(fjx0,tx);
2275 fjy0 = _mm_add_pd(fjy0,ty);
2276 fjz0 = _mm_add_pd(fjz0,tz);
2280 /**************************
2281 * CALCULATE INTERACTIONS *
2282 **************************/
2284 if (gmx_mm_any_lt(rsq11,rcutoff2))
2287 r11 = _mm_mul_pd(rsq11,rinv11);
2289 /* EWALD ELECTROSTATICS */
2291 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2292 ewrt = _mm_mul_pd(r11,ewtabscale);
2293 ewitab = _mm_cvttpd_epi32(ewrt);
2294 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2295 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2296 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2297 felec = _mm_mul_pd(_mm_mul_pd(qq11,rinv11),_mm_sub_pd(rinvsq11,felec));
2299 cutoff_mask = _mm_cmplt_pd(rsq11,rcutoff2);
2303 fscal = _mm_and_pd(fscal,cutoff_mask);
2305 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2307 /* Calculate temporary vectorial force */
2308 tx = _mm_mul_pd(fscal,dx11);
2309 ty = _mm_mul_pd(fscal,dy11);
2310 tz = _mm_mul_pd(fscal,dz11);
2312 /* Update vectorial force */
2313 fix1 = _mm_add_pd(fix1,tx);
2314 fiy1 = _mm_add_pd(fiy1,ty);
2315 fiz1 = _mm_add_pd(fiz1,tz);
2317 fjx1 = _mm_add_pd(fjx1,tx);
2318 fjy1 = _mm_add_pd(fjy1,ty);
2319 fjz1 = _mm_add_pd(fjz1,tz);
2323 /**************************
2324 * CALCULATE INTERACTIONS *
2325 **************************/
2327 if (gmx_mm_any_lt(rsq12,rcutoff2))
2330 r12 = _mm_mul_pd(rsq12,rinv12);
2332 /* EWALD ELECTROSTATICS */
2334 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335 ewrt = _mm_mul_pd(r12,ewtabscale);
2336 ewitab = _mm_cvttpd_epi32(ewrt);
2337 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2338 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2339 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2340 felec = _mm_mul_pd(_mm_mul_pd(qq12,rinv12),_mm_sub_pd(rinvsq12,felec));
2342 cutoff_mask = _mm_cmplt_pd(rsq12,rcutoff2);
2346 fscal = _mm_and_pd(fscal,cutoff_mask);
2348 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2350 /* Calculate temporary vectorial force */
2351 tx = _mm_mul_pd(fscal,dx12);
2352 ty = _mm_mul_pd(fscal,dy12);
2353 tz = _mm_mul_pd(fscal,dz12);
2355 /* Update vectorial force */
2356 fix1 = _mm_add_pd(fix1,tx);
2357 fiy1 = _mm_add_pd(fiy1,ty);
2358 fiz1 = _mm_add_pd(fiz1,tz);
2360 fjx2 = _mm_add_pd(fjx2,tx);
2361 fjy2 = _mm_add_pd(fjy2,ty);
2362 fjz2 = _mm_add_pd(fjz2,tz);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 if (gmx_mm_any_lt(rsq20,rcutoff2))
2373 r20 = _mm_mul_pd(rsq20,rinv20);
2375 /* EWALD ELECTROSTATICS */
2377 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2378 ewrt = _mm_mul_pd(r20,ewtabscale);
2379 ewitab = _mm_cvttpd_epi32(ewrt);
2380 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2381 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2382 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2383 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
2385 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
2389 fscal = _mm_and_pd(fscal,cutoff_mask);
2391 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2393 /* Calculate temporary vectorial force */
2394 tx = _mm_mul_pd(fscal,dx20);
2395 ty = _mm_mul_pd(fscal,dy20);
2396 tz = _mm_mul_pd(fscal,dz20);
2398 /* Update vectorial force */
2399 fix2 = _mm_add_pd(fix2,tx);
2400 fiy2 = _mm_add_pd(fiy2,ty);
2401 fiz2 = _mm_add_pd(fiz2,tz);
2403 fjx0 = _mm_add_pd(fjx0,tx);
2404 fjy0 = _mm_add_pd(fjy0,ty);
2405 fjz0 = _mm_add_pd(fjz0,tz);
2409 /**************************
2410 * CALCULATE INTERACTIONS *
2411 **************************/
2413 if (gmx_mm_any_lt(rsq21,rcutoff2))
2416 r21 = _mm_mul_pd(rsq21,rinv21);
2418 /* EWALD ELECTROSTATICS */
2420 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2421 ewrt = _mm_mul_pd(r21,ewtabscale);
2422 ewitab = _mm_cvttpd_epi32(ewrt);
2423 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2424 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2425 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2426 felec = _mm_mul_pd(_mm_mul_pd(qq21,rinv21),_mm_sub_pd(rinvsq21,felec));
2428 cutoff_mask = _mm_cmplt_pd(rsq21,rcutoff2);
2432 fscal = _mm_and_pd(fscal,cutoff_mask);
2434 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2436 /* Calculate temporary vectorial force */
2437 tx = _mm_mul_pd(fscal,dx21);
2438 ty = _mm_mul_pd(fscal,dy21);
2439 tz = _mm_mul_pd(fscal,dz21);
2441 /* Update vectorial force */
2442 fix2 = _mm_add_pd(fix2,tx);
2443 fiy2 = _mm_add_pd(fiy2,ty);
2444 fiz2 = _mm_add_pd(fiz2,tz);
2446 fjx1 = _mm_add_pd(fjx1,tx);
2447 fjy1 = _mm_add_pd(fjy1,ty);
2448 fjz1 = _mm_add_pd(fjz1,tz);
2452 /**************************
2453 * CALCULATE INTERACTIONS *
2454 **************************/
2456 if (gmx_mm_any_lt(rsq22,rcutoff2))
2459 r22 = _mm_mul_pd(rsq22,rinv22);
2461 /* EWALD ELECTROSTATICS */
2463 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2464 ewrt = _mm_mul_pd(r22,ewtabscale);
2465 ewitab = _mm_cvttpd_epi32(ewrt);
2466 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
2467 gmx_mm_load_1pair_swizzle_pd(ewtab+gmx_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
2468 felec = _mm_add_pd(_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF),_mm_mul_pd(eweps,ewtabFn));
2469 felec = _mm_mul_pd(_mm_mul_pd(qq22,rinv22),_mm_sub_pd(rinvsq22,felec));
2471 cutoff_mask = _mm_cmplt_pd(rsq22,rcutoff2);
2475 fscal = _mm_and_pd(fscal,cutoff_mask);
2477 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2479 /* Calculate temporary vectorial force */
2480 tx = _mm_mul_pd(fscal,dx22);
2481 ty = _mm_mul_pd(fscal,dy22);
2482 tz = _mm_mul_pd(fscal,dz22);
2484 /* Update vectorial force */
2485 fix2 = _mm_add_pd(fix2,tx);
2486 fiy2 = _mm_add_pd(fiy2,ty);
2487 fiz2 = _mm_add_pd(fiz2,tz);
2489 fjx2 = _mm_add_pd(fjx2,tx);
2490 fjy2 = _mm_add_pd(fjy2,ty);
2491 fjz2 = _mm_add_pd(fjz2,tz);
2495 gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2497 /* Inner loop uses 358 flops */
2500 /* End of innermost loop */
2502 gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2503 f+i_coord_offset,fshift+i_shift_offset);
2505 /* Increment number of inner iterations */
2506 inneriter += j_index_end - j_index_start;
2508 /* Outer loop uses 18 flops */
2511 /* Increment number of outer iterations */
2514 /* Update outer/inner flops */
2516 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*358);