2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
98 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
102 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
104 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
106 __m128 dummy_mask,cutoff_mask;
107 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
108 __m128 one = _mm_set1_ps(1.0);
109 __m128 two = _mm_set1_ps(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm_set1_ps(fr->epsfac);
122 charge = mdatoms->chargeA;
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
128 ewtab = fr->ic->tabq_coul_FDV0;
129 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
130 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
132 /* Setup water-specific parameters */
133 inr = nlist->iinr[0];
134 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
135 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
136 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
137 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
139 jq1 = _mm_set1_ps(charge[inr+1]);
140 jq2 = _mm_set1_ps(charge[inr+2]);
141 jq3 = _mm_set1_ps(charge[inr+3]);
142 vdwjidx0A = 2*vdwtype[inr+0];
143 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
144 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
145 qq11 = _mm_mul_ps(iq1,jq1);
146 qq12 = _mm_mul_ps(iq1,jq2);
147 qq13 = _mm_mul_ps(iq1,jq3);
148 qq21 = _mm_mul_ps(iq2,jq1);
149 qq22 = _mm_mul_ps(iq2,jq2);
150 qq23 = _mm_mul_ps(iq2,jq3);
151 qq31 = _mm_mul_ps(iq3,jq1);
152 qq32 = _mm_mul_ps(iq3,jq2);
153 qq33 = _mm_mul_ps(iq3,jq3);
155 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
156 rcutoff_scalar = fr->rcoulomb;
157 rcutoff = _mm_set1_ps(rcutoff_scalar);
158 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
160 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
161 rvdw = _mm_set1_ps(fr->rvdw);
163 /* Avoid stupid compiler warnings */
164 jnrA = jnrB = jnrC = jnrD = 0;
173 for(iidx=0;iidx<4*DIM;iidx++)
178 /* Start outer loop over neighborlists */
179 for(iidx=0; iidx<nri; iidx++)
181 /* Load shift vector for this list */
182 i_shift_offset = DIM*shiftidx[iidx];
184 /* Load limits for loop over neighbors */
185 j_index_start = jindex[iidx];
186 j_index_end = jindex[iidx+1];
188 /* Get outer coordinate index */
190 i_coord_offset = DIM*inr;
192 /* Load i particle coords and add shift vector */
193 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
196 fix0 = _mm_setzero_ps();
197 fiy0 = _mm_setzero_ps();
198 fiz0 = _mm_setzero_ps();
199 fix1 = _mm_setzero_ps();
200 fiy1 = _mm_setzero_ps();
201 fiz1 = _mm_setzero_ps();
202 fix2 = _mm_setzero_ps();
203 fiy2 = _mm_setzero_ps();
204 fiz2 = _mm_setzero_ps();
205 fix3 = _mm_setzero_ps();
206 fiy3 = _mm_setzero_ps();
207 fiz3 = _mm_setzero_ps();
209 /* Reset potential sums */
210 velecsum = _mm_setzero_ps();
211 vvdwsum = _mm_setzero_ps();
213 /* Start inner kernel loop */
214 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
217 /* Get j neighbor index, and coordinate index */
222 j_coord_offsetA = DIM*jnrA;
223 j_coord_offsetB = DIM*jnrB;
224 j_coord_offsetC = DIM*jnrC;
225 j_coord_offsetD = DIM*jnrD;
227 /* load j atom coordinates */
228 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
229 x+j_coord_offsetC,x+j_coord_offsetD,
230 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
231 &jy2,&jz2,&jx3,&jy3,&jz3);
233 /* Calculate displacement vector */
234 dx00 = _mm_sub_ps(ix0,jx0);
235 dy00 = _mm_sub_ps(iy0,jy0);
236 dz00 = _mm_sub_ps(iz0,jz0);
237 dx11 = _mm_sub_ps(ix1,jx1);
238 dy11 = _mm_sub_ps(iy1,jy1);
239 dz11 = _mm_sub_ps(iz1,jz1);
240 dx12 = _mm_sub_ps(ix1,jx2);
241 dy12 = _mm_sub_ps(iy1,jy2);
242 dz12 = _mm_sub_ps(iz1,jz2);
243 dx13 = _mm_sub_ps(ix1,jx3);
244 dy13 = _mm_sub_ps(iy1,jy3);
245 dz13 = _mm_sub_ps(iz1,jz3);
246 dx21 = _mm_sub_ps(ix2,jx1);
247 dy21 = _mm_sub_ps(iy2,jy1);
248 dz21 = _mm_sub_ps(iz2,jz1);
249 dx22 = _mm_sub_ps(ix2,jx2);
250 dy22 = _mm_sub_ps(iy2,jy2);
251 dz22 = _mm_sub_ps(iz2,jz2);
252 dx23 = _mm_sub_ps(ix2,jx3);
253 dy23 = _mm_sub_ps(iy2,jy3);
254 dz23 = _mm_sub_ps(iz2,jz3);
255 dx31 = _mm_sub_ps(ix3,jx1);
256 dy31 = _mm_sub_ps(iy3,jy1);
257 dz31 = _mm_sub_ps(iz3,jz1);
258 dx32 = _mm_sub_ps(ix3,jx2);
259 dy32 = _mm_sub_ps(iy3,jy2);
260 dz32 = _mm_sub_ps(iz3,jz2);
261 dx33 = _mm_sub_ps(ix3,jx3);
262 dy33 = _mm_sub_ps(iy3,jy3);
263 dz33 = _mm_sub_ps(iz3,jz3);
265 /* Calculate squared distance and things based on it */
266 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
267 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
269 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
270 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
272 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
273 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
274 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
275 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
277 rinv11 = gmx_mm_invsqrt_ps(rsq11);
278 rinv12 = gmx_mm_invsqrt_ps(rsq12);
279 rinv13 = gmx_mm_invsqrt_ps(rsq13);
280 rinv21 = gmx_mm_invsqrt_ps(rsq21);
281 rinv22 = gmx_mm_invsqrt_ps(rsq22);
282 rinv23 = gmx_mm_invsqrt_ps(rsq23);
283 rinv31 = gmx_mm_invsqrt_ps(rsq31);
284 rinv32 = gmx_mm_invsqrt_ps(rsq32);
285 rinv33 = gmx_mm_invsqrt_ps(rsq33);
287 rinvsq00 = gmx_mm_inv_ps(rsq00);
288 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
289 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
290 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
291 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
292 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
293 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
294 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
295 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
296 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
298 fjx0 = _mm_setzero_ps();
299 fjy0 = _mm_setzero_ps();
300 fjz0 = _mm_setzero_ps();
301 fjx1 = _mm_setzero_ps();
302 fjy1 = _mm_setzero_ps();
303 fjz1 = _mm_setzero_ps();
304 fjx2 = _mm_setzero_ps();
305 fjy2 = _mm_setzero_ps();
306 fjz2 = _mm_setzero_ps();
307 fjx3 = _mm_setzero_ps();
308 fjy3 = _mm_setzero_ps();
309 fjz3 = _mm_setzero_ps();
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 if (gmx_mm_any_lt(rsq00,rcutoff2))
318 /* LENNARD-JONES DISPERSION/REPULSION */
320 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
321 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
322 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
323 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
324 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
325 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
327 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
329 /* Update potential sum for this i atom from the interaction with this j atom. */
330 vvdw = _mm_and_ps(vvdw,cutoff_mask);
331 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
335 fscal = _mm_and_ps(fscal,cutoff_mask);
337 /* Calculate temporary vectorial force */
338 tx = _mm_mul_ps(fscal,dx00);
339 ty = _mm_mul_ps(fscal,dy00);
340 tz = _mm_mul_ps(fscal,dz00);
342 /* Update vectorial force */
343 fix0 = _mm_add_ps(fix0,tx);
344 fiy0 = _mm_add_ps(fiy0,ty);
345 fiz0 = _mm_add_ps(fiz0,tz);
347 fjx0 = _mm_add_ps(fjx0,tx);
348 fjy0 = _mm_add_ps(fjy0,ty);
349 fjz0 = _mm_add_ps(fjz0,tz);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 if (gmx_mm_any_lt(rsq11,rcutoff2))
360 r11 = _mm_mul_ps(rsq11,rinv11);
362 /* EWALD ELECTROSTATICS */
364 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
365 ewrt = _mm_mul_ps(r11,ewtabscale);
366 ewitab = _mm_cvttps_epi32(ewrt);
367 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
368 ewitab = _mm_slli_epi32(ewitab,2);
369 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
370 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
371 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
372 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
373 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
374 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
375 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
376 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
377 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
379 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
381 /* Update potential sum for this i atom from the interaction with this j atom. */
382 velec = _mm_and_ps(velec,cutoff_mask);
383 velecsum = _mm_add_ps(velecsum,velec);
387 fscal = _mm_and_ps(fscal,cutoff_mask);
389 /* Calculate temporary vectorial force */
390 tx = _mm_mul_ps(fscal,dx11);
391 ty = _mm_mul_ps(fscal,dy11);
392 tz = _mm_mul_ps(fscal,dz11);
394 /* Update vectorial force */
395 fix1 = _mm_add_ps(fix1,tx);
396 fiy1 = _mm_add_ps(fiy1,ty);
397 fiz1 = _mm_add_ps(fiz1,tz);
399 fjx1 = _mm_add_ps(fjx1,tx);
400 fjy1 = _mm_add_ps(fjy1,ty);
401 fjz1 = _mm_add_ps(fjz1,tz);
405 /**************************
406 * CALCULATE INTERACTIONS *
407 **************************/
409 if (gmx_mm_any_lt(rsq12,rcutoff2))
412 r12 = _mm_mul_ps(rsq12,rinv12);
414 /* EWALD ELECTROSTATICS */
416 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
417 ewrt = _mm_mul_ps(r12,ewtabscale);
418 ewitab = _mm_cvttps_epi32(ewrt);
419 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
420 ewitab = _mm_slli_epi32(ewitab,2);
421 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
422 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
423 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
424 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
425 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
426 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
427 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
428 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
429 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
431 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velec = _mm_and_ps(velec,cutoff_mask);
435 velecsum = _mm_add_ps(velecsum,velec);
439 fscal = _mm_and_ps(fscal,cutoff_mask);
441 /* Calculate temporary vectorial force */
442 tx = _mm_mul_ps(fscal,dx12);
443 ty = _mm_mul_ps(fscal,dy12);
444 tz = _mm_mul_ps(fscal,dz12);
446 /* Update vectorial force */
447 fix1 = _mm_add_ps(fix1,tx);
448 fiy1 = _mm_add_ps(fiy1,ty);
449 fiz1 = _mm_add_ps(fiz1,tz);
451 fjx2 = _mm_add_ps(fjx2,tx);
452 fjy2 = _mm_add_ps(fjy2,ty);
453 fjz2 = _mm_add_ps(fjz2,tz);
457 /**************************
458 * CALCULATE INTERACTIONS *
459 **************************/
461 if (gmx_mm_any_lt(rsq13,rcutoff2))
464 r13 = _mm_mul_ps(rsq13,rinv13);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm_mul_ps(r13,ewtabscale);
470 ewitab = _mm_cvttps_epi32(ewrt);
471 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
477 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
479 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
480 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
481 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
483 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velec = _mm_and_ps(velec,cutoff_mask);
487 velecsum = _mm_add_ps(velecsum,velec);
491 fscal = _mm_and_ps(fscal,cutoff_mask);
493 /* Calculate temporary vectorial force */
494 tx = _mm_mul_ps(fscal,dx13);
495 ty = _mm_mul_ps(fscal,dy13);
496 tz = _mm_mul_ps(fscal,dz13);
498 /* Update vectorial force */
499 fix1 = _mm_add_ps(fix1,tx);
500 fiy1 = _mm_add_ps(fiy1,ty);
501 fiz1 = _mm_add_ps(fiz1,tz);
503 fjx3 = _mm_add_ps(fjx3,tx);
504 fjy3 = _mm_add_ps(fjy3,ty);
505 fjz3 = _mm_add_ps(fjz3,tz);
509 /**************************
510 * CALCULATE INTERACTIONS *
511 **************************/
513 if (gmx_mm_any_lt(rsq21,rcutoff2))
516 r21 = _mm_mul_ps(rsq21,rinv21);
518 /* EWALD ELECTROSTATICS */
520 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521 ewrt = _mm_mul_ps(r21,ewtabscale);
522 ewitab = _mm_cvttps_epi32(ewrt);
523 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
524 ewitab = _mm_slli_epi32(ewitab,2);
525 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
526 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
527 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
528 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
529 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
530 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
531 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
532 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
533 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
535 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velec = _mm_and_ps(velec,cutoff_mask);
539 velecsum = _mm_add_ps(velecsum,velec);
543 fscal = _mm_and_ps(fscal,cutoff_mask);
545 /* Calculate temporary vectorial force */
546 tx = _mm_mul_ps(fscal,dx21);
547 ty = _mm_mul_ps(fscal,dy21);
548 tz = _mm_mul_ps(fscal,dz21);
550 /* Update vectorial force */
551 fix2 = _mm_add_ps(fix2,tx);
552 fiy2 = _mm_add_ps(fiy2,ty);
553 fiz2 = _mm_add_ps(fiz2,tz);
555 fjx1 = _mm_add_ps(fjx1,tx);
556 fjy1 = _mm_add_ps(fjy1,ty);
557 fjz1 = _mm_add_ps(fjz1,tz);
561 /**************************
562 * CALCULATE INTERACTIONS *
563 **************************/
565 if (gmx_mm_any_lt(rsq22,rcutoff2))
568 r22 = _mm_mul_ps(rsq22,rinv22);
570 /* EWALD ELECTROSTATICS */
572 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
573 ewrt = _mm_mul_ps(r22,ewtabscale);
574 ewitab = _mm_cvttps_epi32(ewrt);
575 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
576 ewitab = _mm_slli_epi32(ewitab,2);
577 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
578 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
579 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
580 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
581 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
582 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
583 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
584 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
585 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
587 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velec = _mm_and_ps(velec,cutoff_mask);
591 velecsum = _mm_add_ps(velecsum,velec);
595 fscal = _mm_and_ps(fscal,cutoff_mask);
597 /* Calculate temporary vectorial force */
598 tx = _mm_mul_ps(fscal,dx22);
599 ty = _mm_mul_ps(fscal,dy22);
600 tz = _mm_mul_ps(fscal,dz22);
602 /* Update vectorial force */
603 fix2 = _mm_add_ps(fix2,tx);
604 fiy2 = _mm_add_ps(fiy2,ty);
605 fiz2 = _mm_add_ps(fiz2,tz);
607 fjx2 = _mm_add_ps(fjx2,tx);
608 fjy2 = _mm_add_ps(fjy2,ty);
609 fjz2 = _mm_add_ps(fjz2,tz);
613 /**************************
614 * CALCULATE INTERACTIONS *
615 **************************/
617 if (gmx_mm_any_lt(rsq23,rcutoff2))
620 r23 = _mm_mul_ps(rsq23,rinv23);
622 /* EWALD ELECTROSTATICS */
624 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
625 ewrt = _mm_mul_ps(r23,ewtabscale);
626 ewitab = _mm_cvttps_epi32(ewrt);
627 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
628 ewitab = _mm_slli_epi32(ewitab,2);
629 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
630 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
631 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
632 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
633 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
634 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
635 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
636 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
637 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
639 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
641 /* Update potential sum for this i atom from the interaction with this j atom. */
642 velec = _mm_and_ps(velec,cutoff_mask);
643 velecsum = _mm_add_ps(velecsum,velec);
647 fscal = _mm_and_ps(fscal,cutoff_mask);
649 /* Calculate temporary vectorial force */
650 tx = _mm_mul_ps(fscal,dx23);
651 ty = _mm_mul_ps(fscal,dy23);
652 tz = _mm_mul_ps(fscal,dz23);
654 /* Update vectorial force */
655 fix2 = _mm_add_ps(fix2,tx);
656 fiy2 = _mm_add_ps(fiy2,ty);
657 fiz2 = _mm_add_ps(fiz2,tz);
659 fjx3 = _mm_add_ps(fjx3,tx);
660 fjy3 = _mm_add_ps(fjy3,ty);
661 fjz3 = _mm_add_ps(fjz3,tz);
665 /**************************
666 * CALCULATE INTERACTIONS *
667 **************************/
669 if (gmx_mm_any_lt(rsq31,rcutoff2))
672 r31 = _mm_mul_ps(rsq31,rinv31);
674 /* EWALD ELECTROSTATICS */
676 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
677 ewrt = _mm_mul_ps(r31,ewtabscale);
678 ewitab = _mm_cvttps_epi32(ewrt);
679 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
680 ewitab = _mm_slli_epi32(ewitab,2);
681 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
682 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
683 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
684 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
685 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
686 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
687 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
688 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
689 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
691 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velec = _mm_and_ps(velec,cutoff_mask);
695 velecsum = _mm_add_ps(velecsum,velec);
699 fscal = _mm_and_ps(fscal,cutoff_mask);
701 /* Calculate temporary vectorial force */
702 tx = _mm_mul_ps(fscal,dx31);
703 ty = _mm_mul_ps(fscal,dy31);
704 tz = _mm_mul_ps(fscal,dz31);
706 /* Update vectorial force */
707 fix3 = _mm_add_ps(fix3,tx);
708 fiy3 = _mm_add_ps(fiy3,ty);
709 fiz3 = _mm_add_ps(fiz3,tz);
711 fjx1 = _mm_add_ps(fjx1,tx);
712 fjy1 = _mm_add_ps(fjy1,ty);
713 fjz1 = _mm_add_ps(fjz1,tz);
717 /**************************
718 * CALCULATE INTERACTIONS *
719 **************************/
721 if (gmx_mm_any_lt(rsq32,rcutoff2))
724 r32 = _mm_mul_ps(rsq32,rinv32);
726 /* EWALD ELECTROSTATICS */
728 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
729 ewrt = _mm_mul_ps(r32,ewtabscale);
730 ewitab = _mm_cvttps_epi32(ewrt);
731 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
732 ewitab = _mm_slli_epi32(ewitab,2);
733 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
734 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
735 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
736 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
737 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
738 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
739 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
740 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
741 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
743 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
745 /* Update potential sum for this i atom from the interaction with this j atom. */
746 velec = _mm_and_ps(velec,cutoff_mask);
747 velecsum = _mm_add_ps(velecsum,velec);
751 fscal = _mm_and_ps(fscal,cutoff_mask);
753 /* Calculate temporary vectorial force */
754 tx = _mm_mul_ps(fscal,dx32);
755 ty = _mm_mul_ps(fscal,dy32);
756 tz = _mm_mul_ps(fscal,dz32);
758 /* Update vectorial force */
759 fix3 = _mm_add_ps(fix3,tx);
760 fiy3 = _mm_add_ps(fiy3,ty);
761 fiz3 = _mm_add_ps(fiz3,tz);
763 fjx2 = _mm_add_ps(fjx2,tx);
764 fjy2 = _mm_add_ps(fjy2,ty);
765 fjz2 = _mm_add_ps(fjz2,tz);
769 /**************************
770 * CALCULATE INTERACTIONS *
771 **************************/
773 if (gmx_mm_any_lt(rsq33,rcutoff2))
776 r33 = _mm_mul_ps(rsq33,rinv33);
778 /* EWALD ELECTROSTATICS */
780 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
781 ewrt = _mm_mul_ps(r33,ewtabscale);
782 ewitab = _mm_cvttps_epi32(ewrt);
783 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
784 ewitab = _mm_slli_epi32(ewitab,2);
785 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
786 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
787 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
788 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
789 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
790 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
791 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
792 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
793 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
795 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
797 /* Update potential sum for this i atom from the interaction with this j atom. */
798 velec = _mm_and_ps(velec,cutoff_mask);
799 velecsum = _mm_add_ps(velecsum,velec);
803 fscal = _mm_and_ps(fscal,cutoff_mask);
805 /* Calculate temporary vectorial force */
806 tx = _mm_mul_ps(fscal,dx33);
807 ty = _mm_mul_ps(fscal,dy33);
808 tz = _mm_mul_ps(fscal,dz33);
810 /* Update vectorial force */
811 fix3 = _mm_add_ps(fix3,tx);
812 fiy3 = _mm_add_ps(fiy3,ty);
813 fiz3 = _mm_add_ps(fiz3,tz);
815 fjx3 = _mm_add_ps(fjx3,tx);
816 fjy3 = _mm_add_ps(fjy3,ty);
817 fjz3 = _mm_add_ps(fjz3,tz);
821 fjptrA = f+j_coord_offsetA;
822 fjptrB = f+j_coord_offsetB;
823 fjptrC = f+j_coord_offsetC;
824 fjptrD = f+j_coord_offsetD;
826 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
827 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
828 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
830 /* Inner loop uses 458 flops */
836 /* Get j neighbor index, and coordinate index */
837 jnrlistA = jjnr[jidx];
838 jnrlistB = jjnr[jidx+1];
839 jnrlistC = jjnr[jidx+2];
840 jnrlistD = jjnr[jidx+3];
841 /* Sign of each element will be negative for non-real atoms.
842 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
843 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
845 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
846 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
847 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
848 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
849 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
850 j_coord_offsetA = DIM*jnrA;
851 j_coord_offsetB = DIM*jnrB;
852 j_coord_offsetC = DIM*jnrC;
853 j_coord_offsetD = DIM*jnrD;
855 /* load j atom coordinates */
856 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
857 x+j_coord_offsetC,x+j_coord_offsetD,
858 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
859 &jy2,&jz2,&jx3,&jy3,&jz3);
861 /* Calculate displacement vector */
862 dx00 = _mm_sub_ps(ix0,jx0);
863 dy00 = _mm_sub_ps(iy0,jy0);
864 dz00 = _mm_sub_ps(iz0,jz0);
865 dx11 = _mm_sub_ps(ix1,jx1);
866 dy11 = _mm_sub_ps(iy1,jy1);
867 dz11 = _mm_sub_ps(iz1,jz1);
868 dx12 = _mm_sub_ps(ix1,jx2);
869 dy12 = _mm_sub_ps(iy1,jy2);
870 dz12 = _mm_sub_ps(iz1,jz2);
871 dx13 = _mm_sub_ps(ix1,jx3);
872 dy13 = _mm_sub_ps(iy1,jy3);
873 dz13 = _mm_sub_ps(iz1,jz3);
874 dx21 = _mm_sub_ps(ix2,jx1);
875 dy21 = _mm_sub_ps(iy2,jy1);
876 dz21 = _mm_sub_ps(iz2,jz1);
877 dx22 = _mm_sub_ps(ix2,jx2);
878 dy22 = _mm_sub_ps(iy2,jy2);
879 dz22 = _mm_sub_ps(iz2,jz2);
880 dx23 = _mm_sub_ps(ix2,jx3);
881 dy23 = _mm_sub_ps(iy2,jy3);
882 dz23 = _mm_sub_ps(iz2,jz3);
883 dx31 = _mm_sub_ps(ix3,jx1);
884 dy31 = _mm_sub_ps(iy3,jy1);
885 dz31 = _mm_sub_ps(iz3,jz1);
886 dx32 = _mm_sub_ps(ix3,jx2);
887 dy32 = _mm_sub_ps(iy3,jy2);
888 dz32 = _mm_sub_ps(iz3,jz2);
889 dx33 = _mm_sub_ps(ix3,jx3);
890 dy33 = _mm_sub_ps(iy3,jy3);
891 dz33 = _mm_sub_ps(iz3,jz3);
893 /* Calculate squared distance and things based on it */
894 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
895 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
896 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
897 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
898 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
899 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
900 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
901 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
902 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
903 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
905 rinv11 = gmx_mm_invsqrt_ps(rsq11);
906 rinv12 = gmx_mm_invsqrt_ps(rsq12);
907 rinv13 = gmx_mm_invsqrt_ps(rsq13);
908 rinv21 = gmx_mm_invsqrt_ps(rsq21);
909 rinv22 = gmx_mm_invsqrt_ps(rsq22);
910 rinv23 = gmx_mm_invsqrt_ps(rsq23);
911 rinv31 = gmx_mm_invsqrt_ps(rsq31);
912 rinv32 = gmx_mm_invsqrt_ps(rsq32);
913 rinv33 = gmx_mm_invsqrt_ps(rsq33);
915 rinvsq00 = gmx_mm_inv_ps(rsq00);
916 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
917 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
918 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
919 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
920 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
921 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
922 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
923 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
924 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
926 fjx0 = _mm_setzero_ps();
927 fjy0 = _mm_setzero_ps();
928 fjz0 = _mm_setzero_ps();
929 fjx1 = _mm_setzero_ps();
930 fjy1 = _mm_setzero_ps();
931 fjz1 = _mm_setzero_ps();
932 fjx2 = _mm_setzero_ps();
933 fjy2 = _mm_setzero_ps();
934 fjz2 = _mm_setzero_ps();
935 fjx3 = _mm_setzero_ps();
936 fjy3 = _mm_setzero_ps();
937 fjz3 = _mm_setzero_ps();
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 if (gmx_mm_any_lt(rsq00,rcutoff2))
946 /* LENNARD-JONES DISPERSION/REPULSION */
948 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
949 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
950 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
951 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
952 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
953 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
955 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
957 /* Update potential sum for this i atom from the interaction with this j atom. */
958 vvdw = _mm_and_ps(vvdw,cutoff_mask);
959 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
960 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
964 fscal = _mm_and_ps(fscal,cutoff_mask);
966 fscal = _mm_andnot_ps(dummy_mask,fscal);
968 /* Calculate temporary vectorial force */
969 tx = _mm_mul_ps(fscal,dx00);
970 ty = _mm_mul_ps(fscal,dy00);
971 tz = _mm_mul_ps(fscal,dz00);
973 /* Update vectorial force */
974 fix0 = _mm_add_ps(fix0,tx);
975 fiy0 = _mm_add_ps(fiy0,ty);
976 fiz0 = _mm_add_ps(fiz0,tz);
978 fjx0 = _mm_add_ps(fjx0,tx);
979 fjy0 = _mm_add_ps(fjy0,ty);
980 fjz0 = _mm_add_ps(fjz0,tz);
984 /**************************
985 * CALCULATE INTERACTIONS *
986 **************************/
988 if (gmx_mm_any_lt(rsq11,rcutoff2))
991 r11 = _mm_mul_ps(rsq11,rinv11);
992 r11 = _mm_andnot_ps(dummy_mask,r11);
994 /* EWALD ELECTROSTATICS */
996 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
997 ewrt = _mm_mul_ps(r11,ewtabscale);
998 ewitab = _mm_cvttps_epi32(ewrt);
999 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1000 ewitab = _mm_slli_epi32(ewitab,2);
1001 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1002 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1003 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1004 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1005 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1006 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1007 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1008 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1009 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1011 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1013 /* Update potential sum for this i atom from the interaction with this j atom. */
1014 velec = _mm_and_ps(velec,cutoff_mask);
1015 velec = _mm_andnot_ps(dummy_mask,velec);
1016 velecsum = _mm_add_ps(velecsum,velec);
1020 fscal = _mm_and_ps(fscal,cutoff_mask);
1022 fscal = _mm_andnot_ps(dummy_mask,fscal);
1024 /* Calculate temporary vectorial force */
1025 tx = _mm_mul_ps(fscal,dx11);
1026 ty = _mm_mul_ps(fscal,dy11);
1027 tz = _mm_mul_ps(fscal,dz11);
1029 /* Update vectorial force */
1030 fix1 = _mm_add_ps(fix1,tx);
1031 fiy1 = _mm_add_ps(fiy1,ty);
1032 fiz1 = _mm_add_ps(fiz1,tz);
1034 fjx1 = _mm_add_ps(fjx1,tx);
1035 fjy1 = _mm_add_ps(fjy1,ty);
1036 fjz1 = _mm_add_ps(fjz1,tz);
1040 /**************************
1041 * CALCULATE INTERACTIONS *
1042 **************************/
1044 if (gmx_mm_any_lt(rsq12,rcutoff2))
1047 r12 = _mm_mul_ps(rsq12,rinv12);
1048 r12 = _mm_andnot_ps(dummy_mask,r12);
1050 /* EWALD ELECTROSTATICS */
1052 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1053 ewrt = _mm_mul_ps(r12,ewtabscale);
1054 ewitab = _mm_cvttps_epi32(ewrt);
1055 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1056 ewitab = _mm_slli_epi32(ewitab,2);
1057 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1058 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1059 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1060 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1061 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1062 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1063 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1064 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1065 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1067 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1069 /* Update potential sum for this i atom from the interaction with this j atom. */
1070 velec = _mm_and_ps(velec,cutoff_mask);
1071 velec = _mm_andnot_ps(dummy_mask,velec);
1072 velecsum = _mm_add_ps(velecsum,velec);
1076 fscal = _mm_and_ps(fscal,cutoff_mask);
1078 fscal = _mm_andnot_ps(dummy_mask,fscal);
1080 /* Calculate temporary vectorial force */
1081 tx = _mm_mul_ps(fscal,dx12);
1082 ty = _mm_mul_ps(fscal,dy12);
1083 tz = _mm_mul_ps(fscal,dz12);
1085 /* Update vectorial force */
1086 fix1 = _mm_add_ps(fix1,tx);
1087 fiy1 = _mm_add_ps(fiy1,ty);
1088 fiz1 = _mm_add_ps(fiz1,tz);
1090 fjx2 = _mm_add_ps(fjx2,tx);
1091 fjy2 = _mm_add_ps(fjy2,ty);
1092 fjz2 = _mm_add_ps(fjz2,tz);
1096 /**************************
1097 * CALCULATE INTERACTIONS *
1098 **************************/
1100 if (gmx_mm_any_lt(rsq13,rcutoff2))
1103 r13 = _mm_mul_ps(rsq13,rinv13);
1104 r13 = _mm_andnot_ps(dummy_mask,r13);
1106 /* EWALD ELECTROSTATICS */
1108 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1109 ewrt = _mm_mul_ps(r13,ewtabscale);
1110 ewitab = _mm_cvttps_epi32(ewrt);
1111 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1112 ewitab = _mm_slli_epi32(ewitab,2);
1113 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1114 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1115 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1116 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1117 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1118 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1119 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1120 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
1121 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1123 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1125 /* Update potential sum for this i atom from the interaction with this j atom. */
1126 velec = _mm_and_ps(velec,cutoff_mask);
1127 velec = _mm_andnot_ps(dummy_mask,velec);
1128 velecsum = _mm_add_ps(velecsum,velec);
1132 fscal = _mm_and_ps(fscal,cutoff_mask);
1134 fscal = _mm_andnot_ps(dummy_mask,fscal);
1136 /* Calculate temporary vectorial force */
1137 tx = _mm_mul_ps(fscal,dx13);
1138 ty = _mm_mul_ps(fscal,dy13);
1139 tz = _mm_mul_ps(fscal,dz13);
1141 /* Update vectorial force */
1142 fix1 = _mm_add_ps(fix1,tx);
1143 fiy1 = _mm_add_ps(fiy1,ty);
1144 fiz1 = _mm_add_ps(fiz1,tz);
1146 fjx3 = _mm_add_ps(fjx3,tx);
1147 fjy3 = _mm_add_ps(fjy3,ty);
1148 fjz3 = _mm_add_ps(fjz3,tz);
1152 /**************************
1153 * CALCULATE INTERACTIONS *
1154 **************************/
1156 if (gmx_mm_any_lt(rsq21,rcutoff2))
1159 r21 = _mm_mul_ps(rsq21,rinv21);
1160 r21 = _mm_andnot_ps(dummy_mask,r21);
1162 /* EWALD ELECTROSTATICS */
1164 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1165 ewrt = _mm_mul_ps(r21,ewtabscale);
1166 ewitab = _mm_cvttps_epi32(ewrt);
1167 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1168 ewitab = _mm_slli_epi32(ewitab,2);
1169 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1170 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1171 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1172 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1173 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1174 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1175 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1176 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1177 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1179 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1181 /* Update potential sum for this i atom from the interaction with this j atom. */
1182 velec = _mm_and_ps(velec,cutoff_mask);
1183 velec = _mm_andnot_ps(dummy_mask,velec);
1184 velecsum = _mm_add_ps(velecsum,velec);
1188 fscal = _mm_and_ps(fscal,cutoff_mask);
1190 fscal = _mm_andnot_ps(dummy_mask,fscal);
1192 /* Calculate temporary vectorial force */
1193 tx = _mm_mul_ps(fscal,dx21);
1194 ty = _mm_mul_ps(fscal,dy21);
1195 tz = _mm_mul_ps(fscal,dz21);
1197 /* Update vectorial force */
1198 fix2 = _mm_add_ps(fix2,tx);
1199 fiy2 = _mm_add_ps(fiy2,ty);
1200 fiz2 = _mm_add_ps(fiz2,tz);
1202 fjx1 = _mm_add_ps(fjx1,tx);
1203 fjy1 = _mm_add_ps(fjy1,ty);
1204 fjz1 = _mm_add_ps(fjz1,tz);
1208 /**************************
1209 * CALCULATE INTERACTIONS *
1210 **************************/
1212 if (gmx_mm_any_lt(rsq22,rcutoff2))
1215 r22 = _mm_mul_ps(rsq22,rinv22);
1216 r22 = _mm_andnot_ps(dummy_mask,r22);
1218 /* EWALD ELECTROSTATICS */
1220 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1221 ewrt = _mm_mul_ps(r22,ewtabscale);
1222 ewitab = _mm_cvttps_epi32(ewrt);
1223 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1224 ewitab = _mm_slli_epi32(ewitab,2);
1225 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1226 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1227 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1228 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1229 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1230 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1231 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1232 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1233 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1235 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1237 /* Update potential sum for this i atom from the interaction with this j atom. */
1238 velec = _mm_and_ps(velec,cutoff_mask);
1239 velec = _mm_andnot_ps(dummy_mask,velec);
1240 velecsum = _mm_add_ps(velecsum,velec);
1244 fscal = _mm_and_ps(fscal,cutoff_mask);
1246 fscal = _mm_andnot_ps(dummy_mask,fscal);
1248 /* Calculate temporary vectorial force */
1249 tx = _mm_mul_ps(fscal,dx22);
1250 ty = _mm_mul_ps(fscal,dy22);
1251 tz = _mm_mul_ps(fscal,dz22);
1253 /* Update vectorial force */
1254 fix2 = _mm_add_ps(fix2,tx);
1255 fiy2 = _mm_add_ps(fiy2,ty);
1256 fiz2 = _mm_add_ps(fiz2,tz);
1258 fjx2 = _mm_add_ps(fjx2,tx);
1259 fjy2 = _mm_add_ps(fjy2,ty);
1260 fjz2 = _mm_add_ps(fjz2,tz);
1264 /**************************
1265 * CALCULATE INTERACTIONS *
1266 **************************/
1268 if (gmx_mm_any_lt(rsq23,rcutoff2))
1271 r23 = _mm_mul_ps(rsq23,rinv23);
1272 r23 = _mm_andnot_ps(dummy_mask,r23);
1274 /* EWALD ELECTROSTATICS */
1276 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1277 ewrt = _mm_mul_ps(r23,ewtabscale);
1278 ewitab = _mm_cvttps_epi32(ewrt);
1279 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1280 ewitab = _mm_slli_epi32(ewitab,2);
1281 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1282 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1283 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1284 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1285 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1286 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1287 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1288 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1289 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1291 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1293 /* Update potential sum for this i atom from the interaction with this j atom. */
1294 velec = _mm_and_ps(velec,cutoff_mask);
1295 velec = _mm_andnot_ps(dummy_mask,velec);
1296 velecsum = _mm_add_ps(velecsum,velec);
1300 fscal = _mm_and_ps(fscal,cutoff_mask);
1302 fscal = _mm_andnot_ps(dummy_mask,fscal);
1304 /* Calculate temporary vectorial force */
1305 tx = _mm_mul_ps(fscal,dx23);
1306 ty = _mm_mul_ps(fscal,dy23);
1307 tz = _mm_mul_ps(fscal,dz23);
1309 /* Update vectorial force */
1310 fix2 = _mm_add_ps(fix2,tx);
1311 fiy2 = _mm_add_ps(fiy2,ty);
1312 fiz2 = _mm_add_ps(fiz2,tz);
1314 fjx3 = _mm_add_ps(fjx3,tx);
1315 fjy3 = _mm_add_ps(fjy3,ty);
1316 fjz3 = _mm_add_ps(fjz3,tz);
1320 /**************************
1321 * CALCULATE INTERACTIONS *
1322 **************************/
1324 if (gmx_mm_any_lt(rsq31,rcutoff2))
1327 r31 = _mm_mul_ps(rsq31,rinv31);
1328 r31 = _mm_andnot_ps(dummy_mask,r31);
1330 /* EWALD ELECTROSTATICS */
1332 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1333 ewrt = _mm_mul_ps(r31,ewtabscale);
1334 ewitab = _mm_cvttps_epi32(ewrt);
1335 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1336 ewitab = _mm_slli_epi32(ewitab,2);
1337 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1338 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1339 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1340 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1341 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1342 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1343 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1344 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1345 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1347 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1349 /* Update potential sum for this i atom from the interaction with this j atom. */
1350 velec = _mm_and_ps(velec,cutoff_mask);
1351 velec = _mm_andnot_ps(dummy_mask,velec);
1352 velecsum = _mm_add_ps(velecsum,velec);
1356 fscal = _mm_and_ps(fscal,cutoff_mask);
1358 fscal = _mm_andnot_ps(dummy_mask,fscal);
1360 /* Calculate temporary vectorial force */
1361 tx = _mm_mul_ps(fscal,dx31);
1362 ty = _mm_mul_ps(fscal,dy31);
1363 tz = _mm_mul_ps(fscal,dz31);
1365 /* Update vectorial force */
1366 fix3 = _mm_add_ps(fix3,tx);
1367 fiy3 = _mm_add_ps(fiy3,ty);
1368 fiz3 = _mm_add_ps(fiz3,tz);
1370 fjx1 = _mm_add_ps(fjx1,tx);
1371 fjy1 = _mm_add_ps(fjy1,ty);
1372 fjz1 = _mm_add_ps(fjz1,tz);
1376 /**************************
1377 * CALCULATE INTERACTIONS *
1378 **************************/
1380 if (gmx_mm_any_lt(rsq32,rcutoff2))
1383 r32 = _mm_mul_ps(rsq32,rinv32);
1384 r32 = _mm_andnot_ps(dummy_mask,r32);
1386 /* EWALD ELECTROSTATICS */
1388 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1389 ewrt = _mm_mul_ps(r32,ewtabscale);
1390 ewitab = _mm_cvttps_epi32(ewrt);
1391 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1392 ewitab = _mm_slli_epi32(ewitab,2);
1393 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1394 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1395 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1396 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1397 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1398 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1399 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1400 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1401 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1403 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1405 /* Update potential sum for this i atom from the interaction with this j atom. */
1406 velec = _mm_and_ps(velec,cutoff_mask);
1407 velec = _mm_andnot_ps(dummy_mask,velec);
1408 velecsum = _mm_add_ps(velecsum,velec);
1412 fscal = _mm_and_ps(fscal,cutoff_mask);
1414 fscal = _mm_andnot_ps(dummy_mask,fscal);
1416 /* Calculate temporary vectorial force */
1417 tx = _mm_mul_ps(fscal,dx32);
1418 ty = _mm_mul_ps(fscal,dy32);
1419 tz = _mm_mul_ps(fscal,dz32);
1421 /* Update vectorial force */
1422 fix3 = _mm_add_ps(fix3,tx);
1423 fiy3 = _mm_add_ps(fiy3,ty);
1424 fiz3 = _mm_add_ps(fiz3,tz);
1426 fjx2 = _mm_add_ps(fjx2,tx);
1427 fjy2 = _mm_add_ps(fjy2,ty);
1428 fjz2 = _mm_add_ps(fjz2,tz);
1432 /**************************
1433 * CALCULATE INTERACTIONS *
1434 **************************/
1436 if (gmx_mm_any_lt(rsq33,rcutoff2))
1439 r33 = _mm_mul_ps(rsq33,rinv33);
1440 r33 = _mm_andnot_ps(dummy_mask,r33);
1442 /* EWALD ELECTROSTATICS */
1444 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1445 ewrt = _mm_mul_ps(r33,ewtabscale);
1446 ewitab = _mm_cvttps_epi32(ewrt);
1447 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1448 ewitab = _mm_slli_epi32(ewitab,2);
1449 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1450 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1451 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1452 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1453 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1454 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1455 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1456 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1457 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1459 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1461 /* Update potential sum for this i atom from the interaction with this j atom. */
1462 velec = _mm_and_ps(velec,cutoff_mask);
1463 velec = _mm_andnot_ps(dummy_mask,velec);
1464 velecsum = _mm_add_ps(velecsum,velec);
1468 fscal = _mm_and_ps(fscal,cutoff_mask);
1470 fscal = _mm_andnot_ps(dummy_mask,fscal);
1472 /* Calculate temporary vectorial force */
1473 tx = _mm_mul_ps(fscal,dx33);
1474 ty = _mm_mul_ps(fscal,dy33);
1475 tz = _mm_mul_ps(fscal,dz33);
1477 /* Update vectorial force */
1478 fix3 = _mm_add_ps(fix3,tx);
1479 fiy3 = _mm_add_ps(fiy3,ty);
1480 fiz3 = _mm_add_ps(fiz3,tz);
1482 fjx3 = _mm_add_ps(fjx3,tx);
1483 fjy3 = _mm_add_ps(fjy3,ty);
1484 fjz3 = _mm_add_ps(fjz3,tz);
1488 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1489 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1490 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1491 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1493 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1494 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1495 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1497 /* Inner loop uses 467 flops */
1500 /* End of innermost loop */
1502 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1503 f+i_coord_offset,fshift+i_shift_offset);
1506 /* Update potential energies */
1507 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1508 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1510 /* Increment number of inner iterations */
1511 inneriter += j_index_end - j_index_start;
1513 /* Outer loop uses 26 flops */
1516 /* Increment number of outer iterations */
1519 /* Update outer/inner flops */
1521 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*467);
1524 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
1525 * Electrostatics interaction: Ewald
1526 * VdW interaction: LennardJones
1527 * Geometry: Water4-Water4
1528 * Calculate force/pot: Force
1531 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_sse2_single
1532 (t_nblist * gmx_restrict nlist,
1533 rvec * gmx_restrict xx,
1534 rvec * gmx_restrict ff,
1535 t_forcerec * gmx_restrict fr,
1536 t_mdatoms * gmx_restrict mdatoms,
1537 nb_kernel_data_t * gmx_restrict kernel_data,
1538 t_nrnb * gmx_restrict nrnb)
1540 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1541 * just 0 for non-waters.
1542 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1543 * jnr indices corresponding to data put in the four positions in the SIMD register.
1545 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1546 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1547 int jnrA,jnrB,jnrC,jnrD;
1548 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1549 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1550 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1551 real rcutoff_scalar;
1552 real *shiftvec,*fshift,*x,*f;
1553 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1554 real scratch[4*DIM];
1555 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1557 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1559 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1561 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1563 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1564 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1565 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1566 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1567 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1568 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1569 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1570 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1571 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1572 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1573 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1574 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1575 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1576 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1577 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1578 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1579 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1580 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1581 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1582 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1585 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1588 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1589 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1591 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1593 __m128 dummy_mask,cutoff_mask;
1594 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1595 __m128 one = _mm_set1_ps(1.0);
1596 __m128 two = _mm_set1_ps(2.0);
1602 jindex = nlist->jindex;
1604 shiftidx = nlist->shift;
1606 shiftvec = fr->shift_vec[0];
1607 fshift = fr->fshift[0];
1608 facel = _mm_set1_ps(fr->epsfac);
1609 charge = mdatoms->chargeA;
1610 nvdwtype = fr->ntype;
1611 vdwparam = fr->nbfp;
1612 vdwtype = mdatoms->typeA;
1614 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1615 ewtab = fr->ic->tabq_coul_F;
1616 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1617 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1619 /* Setup water-specific parameters */
1620 inr = nlist->iinr[0];
1621 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1622 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1623 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1624 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1626 jq1 = _mm_set1_ps(charge[inr+1]);
1627 jq2 = _mm_set1_ps(charge[inr+2]);
1628 jq3 = _mm_set1_ps(charge[inr+3]);
1629 vdwjidx0A = 2*vdwtype[inr+0];
1630 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1631 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1632 qq11 = _mm_mul_ps(iq1,jq1);
1633 qq12 = _mm_mul_ps(iq1,jq2);
1634 qq13 = _mm_mul_ps(iq1,jq3);
1635 qq21 = _mm_mul_ps(iq2,jq1);
1636 qq22 = _mm_mul_ps(iq2,jq2);
1637 qq23 = _mm_mul_ps(iq2,jq3);
1638 qq31 = _mm_mul_ps(iq3,jq1);
1639 qq32 = _mm_mul_ps(iq3,jq2);
1640 qq33 = _mm_mul_ps(iq3,jq3);
1642 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1643 rcutoff_scalar = fr->rcoulomb;
1644 rcutoff = _mm_set1_ps(rcutoff_scalar);
1645 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1647 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1648 rvdw = _mm_set1_ps(fr->rvdw);
1650 /* Avoid stupid compiler warnings */
1651 jnrA = jnrB = jnrC = jnrD = 0;
1652 j_coord_offsetA = 0;
1653 j_coord_offsetB = 0;
1654 j_coord_offsetC = 0;
1655 j_coord_offsetD = 0;
1660 for(iidx=0;iidx<4*DIM;iidx++)
1662 scratch[iidx] = 0.0;
1665 /* Start outer loop over neighborlists */
1666 for(iidx=0; iidx<nri; iidx++)
1668 /* Load shift vector for this list */
1669 i_shift_offset = DIM*shiftidx[iidx];
1671 /* Load limits for loop over neighbors */
1672 j_index_start = jindex[iidx];
1673 j_index_end = jindex[iidx+1];
1675 /* Get outer coordinate index */
1677 i_coord_offset = DIM*inr;
1679 /* Load i particle coords and add shift vector */
1680 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1681 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1683 fix0 = _mm_setzero_ps();
1684 fiy0 = _mm_setzero_ps();
1685 fiz0 = _mm_setzero_ps();
1686 fix1 = _mm_setzero_ps();
1687 fiy1 = _mm_setzero_ps();
1688 fiz1 = _mm_setzero_ps();
1689 fix2 = _mm_setzero_ps();
1690 fiy2 = _mm_setzero_ps();
1691 fiz2 = _mm_setzero_ps();
1692 fix3 = _mm_setzero_ps();
1693 fiy3 = _mm_setzero_ps();
1694 fiz3 = _mm_setzero_ps();
1696 /* Start inner kernel loop */
1697 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1700 /* Get j neighbor index, and coordinate index */
1702 jnrB = jjnr[jidx+1];
1703 jnrC = jjnr[jidx+2];
1704 jnrD = jjnr[jidx+3];
1705 j_coord_offsetA = DIM*jnrA;
1706 j_coord_offsetB = DIM*jnrB;
1707 j_coord_offsetC = DIM*jnrC;
1708 j_coord_offsetD = DIM*jnrD;
1710 /* load j atom coordinates */
1711 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1712 x+j_coord_offsetC,x+j_coord_offsetD,
1713 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1714 &jy2,&jz2,&jx3,&jy3,&jz3);
1716 /* Calculate displacement vector */
1717 dx00 = _mm_sub_ps(ix0,jx0);
1718 dy00 = _mm_sub_ps(iy0,jy0);
1719 dz00 = _mm_sub_ps(iz0,jz0);
1720 dx11 = _mm_sub_ps(ix1,jx1);
1721 dy11 = _mm_sub_ps(iy1,jy1);
1722 dz11 = _mm_sub_ps(iz1,jz1);
1723 dx12 = _mm_sub_ps(ix1,jx2);
1724 dy12 = _mm_sub_ps(iy1,jy2);
1725 dz12 = _mm_sub_ps(iz1,jz2);
1726 dx13 = _mm_sub_ps(ix1,jx3);
1727 dy13 = _mm_sub_ps(iy1,jy3);
1728 dz13 = _mm_sub_ps(iz1,jz3);
1729 dx21 = _mm_sub_ps(ix2,jx1);
1730 dy21 = _mm_sub_ps(iy2,jy1);
1731 dz21 = _mm_sub_ps(iz2,jz1);
1732 dx22 = _mm_sub_ps(ix2,jx2);
1733 dy22 = _mm_sub_ps(iy2,jy2);
1734 dz22 = _mm_sub_ps(iz2,jz2);
1735 dx23 = _mm_sub_ps(ix2,jx3);
1736 dy23 = _mm_sub_ps(iy2,jy3);
1737 dz23 = _mm_sub_ps(iz2,jz3);
1738 dx31 = _mm_sub_ps(ix3,jx1);
1739 dy31 = _mm_sub_ps(iy3,jy1);
1740 dz31 = _mm_sub_ps(iz3,jz1);
1741 dx32 = _mm_sub_ps(ix3,jx2);
1742 dy32 = _mm_sub_ps(iy3,jy2);
1743 dz32 = _mm_sub_ps(iz3,jz2);
1744 dx33 = _mm_sub_ps(ix3,jx3);
1745 dy33 = _mm_sub_ps(iy3,jy3);
1746 dz33 = _mm_sub_ps(iz3,jz3);
1748 /* Calculate squared distance and things based on it */
1749 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1750 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1751 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1752 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1753 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1754 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1755 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1756 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1757 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1758 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1760 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1761 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1762 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1763 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1764 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1765 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1766 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1767 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1768 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1770 rinvsq00 = gmx_mm_inv_ps(rsq00);
1771 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1772 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1773 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1774 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1775 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1776 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1777 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1778 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1779 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1781 fjx0 = _mm_setzero_ps();
1782 fjy0 = _mm_setzero_ps();
1783 fjz0 = _mm_setzero_ps();
1784 fjx1 = _mm_setzero_ps();
1785 fjy1 = _mm_setzero_ps();
1786 fjz1 = _mm_setzero_ps();
1787 fjx2 = _mm_setzero_ps();
1788 fjy2 = _mm_setzero_ps();
1789 fjz2 = _mm_setzero_ps();
1790 fjx3 = _mm_setzero_ps();
1791 fjy3 = _mm_setzero_ps();
1792 fjz3 = _mm_setzero_ps();
1794 /**************************
1795 * CALCULATE INTERACTIONS *
1796 **************************/
1798 if (gmx_mm_any_lt(rsq00,rcutoff2))
1801 /* LENNARD-JONES DISPERSION/REPULSION */
1803 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1804 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1806 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1810 fscal = _mm_and_ps(fscal,cutoff_mask);
1812 /* Calculate temporary vectorial force */
1813 tx = _mm_mul_ps(fscal,dx00);
1814 ty = _mm_mul_ps(fscal,dy00);
1815 tz = _mm_mul_ps(fscal,dz00);
1817 /* Update vectorial force */
1818 fix0 = _mm_add_ps(fix0,tx);
1819 fiy0 = _mm_add_ps(fiy0,ty);
1820 fiz0 = _mm_add_ps(fiz0,tz);
1822 fjx0 = _mm_add_ps(fjx0,tx);
1823 fjy0 = _mm_add_ps(fjy0,ty);
1824 fjz0 = _mm_add_ps(fjz0,tz);
1828 /**************************
1829 * CALCULATE INTERACTIONS *
1830 **************************/
1832 if (gmx_mm_any_lt(rsq11,rcutoff2))
1835 r11 = _mm_mul_ps(rsq11,rinv11);
1837 /* EWALD ELECTROSTATICS */
1839 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1840 ewrt = _mm_mul_ps(r11,ewtabscale);
1841 ewitab = _mm_cvttps_epi32(ewrt);
1842 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1843 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1844 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1846 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1847 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1849 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1853 fscal = _mm_and_ps(fscal,cutoff_mask);
1855 /* Calculate temporary vectorial force */
1856 tx = _mm_mul_ps(fscal,dx11);
1857 ty = _mm_mul_ps(fscal,dy11);
1858 tz = _mm_mul_ps(fscal,dz11);
1860 /* Update vectorial force */
1861 fix1 = _mm_add_ps(fix1,tx);
1862 fiy1 = _mm_add_ps(fiy1,ty);
1863 fiz1 = _mm_add_ps(fiz1,tz);
1865 fjx1 = _mm_add_ps(fjx1,tx);
1866 fjy1 = _mm_add_ps(fjy1,ty);
1867 fjz1 = _mm_add_ps(fjz1,tz);
1871 /**************************
1872 * CALCULATE INTERACTIONS *
1873 **************************/
1875 if (gmx_mm_any_lt(rsq12,rcutoff2))
1878 r12 = _mm_mul_ps(rsq12,rinv12);
1880 /* EWALD ELECTROSTATICS */
1882 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1883 ewrt = _mm_mul_ps(r12,ewtabscale);
1884 ewitab = _mm_cvttps_epi32(ewrt);
1885 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1886 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1887 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1889 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1890 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1892 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1896 fscal = _mm_and_ps(fscal,cutoff_mask);
1898 /* Calculate temporary vectorial force */
1899 tx = _mm_mul_ps(fscal,dx12);
1900 ty = _mm_mul_ps(fscal,dy12);
1901 tz = _mm_mul_ps(fscal,dz12);
1903 /* Update vectorial force */
1904 fix1 = _mm_add_ps(fix1,tx);
1905 fiy1 = _mm_add_ps(fiy1,ty);
1906 fiz1 = _mm_add_ps(fiz1,tz);
1908 fjx2 = _mm_add_ps(fjx2,tx);
1909 fjy2 = _mm_add_ps(fjy2,ty);
1910 fjz2 = _mm_add_ps(fjz2,tz);
1914 /**************************
1915 * CALCULATE INTERACTIONS *
1916 **************************/
1918 if (gmx_mm_any_lt(rsq13,rcutoff2))
1921 r13 = _mm_mul_ps(rsq13,rinv13);
1923 /* EWALD ELECTROSTATICS */
1925 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1926 ewrt = _mm_mul_ps(r13,ewtabscale);
1927 ewitab = _mm_cvttps_epi32(ewrt);
1928 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1929 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1930 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1932 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1933 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1935 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1939 fscal = _mm_and_ps(fscal,cutoff_mask);
1941 /* Calculate temporary vectorial force */
1942 tx = _mm_mul_ps(fscal,dx13);
1943 ty = _mm_mul_ps(fscal,dy13);
1944 tz = _mm_mul_ps(fscal,dz13);
1946 /* Update vectorial force */
1947 fix1 = _mm_add_ps(fix1,tx);
1948 fiy1 = _mm_add_ps(fiy1,ty);
1949 fiz1 = _mm_add_ps(fiz1,tz);
1951 fjx3 = _mm_add_ps(fjx3,tx);
1952 fjy3 = _mm_add_ps(fjy3,ty);
1953 fjz3 = _mm_add_ps(fjz3,tz);
1957 /**************************
1958 * CALCULATE INTERACTIONS *
1959 **************************/
1961 if (gmx_mm_any_lt(rsq21,rcutoff2))
1964 r21 = _mm_mul_ps(rsq21,rinv21);
1966 /* EWALD ELECTROSTATICS */
1968 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1969 ewrt = _mm_mul_ps(r21,ewtabscale);
1970 ewitab = _mm_cvttps_epi32(ewrt);
1971 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1972 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1973 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1975 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1976 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1978 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1982 fscal = _mm_and_ps(fscal,cutoff_mask);
1984 /* Calculate temporary vectorial force */
1985 tx = _mm_mul_ps(fscal,dx21);
1986 ty = _mm_mul_ps(fscal,dy21);
1987 tz = _mm_mul_ps(fscal,dz21);
1989 /* Update vectorial force */
1990 fix2 = _mm_add_ps(fix2,tx);
1991 fiy2 = _mm_add_ps(fiy2,ty);
1992 fiz2 = _mm_add_ps(fiz2,tz);
1994 fjx1 = _mm_add_ps(fjx1,tx);
1995 fjy1 = _mm_add_ps(fjy1,ty);
1996 fjz1 = _mm_add_ps(fjz1,tz);
2000 /**************************
2001 * CALCULATE INTERACTIONS *
2002 **************************/
2004 if (gmx_mm_any_lt(rsq22,rcutoff2))
2007 r22 = _mm_mul_ps(rsq22,rinv22);
2009 /* EWALD ELECTROSTATICS */
2011 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2012 ewrt = _mm_mul_ps(r22,ewtabscale);
2013 ewitab = _mm_cvttps_epi32(ewrt);
2014 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2015 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2016 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2018 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2019 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2021 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2025 fscal = _mm_and_ps(fscal,cutoff_mask);
2027 /* Calculate temporary vectorial force */
2028 tx = _mm_mul_ps(fscal,dx22);
2029 ty = _mm_mul_ps(fscal,dy22);
2030 tz = _mm_mul_ps(fscal,dz22);
2032 /* Update vectorial force */
2033 fix2 = _mm_add_ps(fix2,tx);
2034 fiy2 = _mm_add_ps(fiy2,ty);
2035 fiz2 = _mm_add_ps(fiz2,tz);
2037 fjx2 = _mm_add_ps(fjx2,tx);
2038 fjy2 = _mm_add_ps(fjy2,ty);
2039 fjz2 = _mm_add_ps(fjz2,tz);
2043 /**************************
2044 * CALCULATE INTERACTIONS *
2045 **************************/
2047 if (gmx_mm_any_lt(rsq23,rcutoff2))
2050 r23 = _mm_mul_ps(rsq23,rinv23);
2052 /* EWALD ELECTROSTATICS */
2054 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2055 ewrt = _mm_mul_ps(r23,ewtabscale);
2056 ewitab = _mm_cvttps_epi32(ewrt);
2057 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2058 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2059 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2061 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2062 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2064 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2068 fscal = _mm_and_ps(fscal,cutoff_mask);
2070 /* Calculate temporary vectorial force */
2071 tx = _mm_mul_ps(fscal,dx23);
2072 ty = _mm_mul_ps(fscal,dy23);
2073 tz = _mm_mul_ps(fscal,dz23);
2075 /* Update vectorial force */
2076 fix2 = _mm_add_ps(fix2,tx);
2077 fiy2 = _mm_add_ps(fiy2,ty);
2078 fiz2 = _mm_add_ps(fiz2,tz);
2080 fjx3 = _mm_add_ps(fjx3,tx);
2081 fjy3 = _mm_add_ps(fjy3,ty);
2082 fjz3 = _mm_add_ps(fjz3,tz);
2086 /**************************
2087 * CALCULATE INTERACTIONS *
2088 **************************/
2090 if (gmx_mm_any_lt(rsq31,rcutoff2))
2093 r31 = _mm_mul_ps(rsq31,rinv31);
2095 /* EWALD ELECTROSTATICS */
2097 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2098 ewrt = _mm_mul_ps(r31,ewtabscale);
2099 ewitab = _mm_cvttps_epi32(ewrt);
2100 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2101 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2102 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2104 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2105 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2107 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2111 fscal = _mm_and_ps(fscal,cutoff_mask);
2113 /* Calculate temporary vectorial force */
2114 tx = _mm_mul_ps(fscal,dx31);
2115 ty = _mm_mul_ps(fscal,dy31);
2116 tz = _mm_mul_ps(fscal,dz31);
2118 /* Update vectorial force */
2119 fix3 = _mm_add_ps(fix3,tx);
2120 fiy3 = _mm_add_ps(fiy3,ty);
2121 fiz3 = _mm_add_ps(fiz3,tz);
2123 fjx1 = _mm_add_ps(fjx1,tx);
2124 fjy1 = _mm_add_ps(fjy1,ty);
2125 fjz1 = _mm_add_ps(fjz1,tz);
2129 /**************************
2130 * CALCULATE INTERACTIONS *
2131 **************************/
2133 if (gmx_mm_any_lt(rsq32,rcutoff2))
2136 r32 = _mm_mul_ps(rsq32,rinv32);
2138 /* EWALD ELECTROSTATICS */
2140 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2141 ewrt = _mm_mul_ps(r32,ewtabscale);
2142 ewitab = _mm_cvttps_epi32(ewrt);
2143 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2144 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2145 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2147 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2148 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2150 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2154 fscal = _mm_and_ps(fscal,cutoff_mask);
2156 /* Calculate temporary vectorial force */
2157 tx = _mm_mul_ps(fscal,dx32);
2158 ty = _mm_mul_ps(fscal,dy32);
2159 tz = _mm_mul_ps(fscal,dz32);
2161 /* Update vectorial force */
2162 fix3 = _mm_add_ps(fix3,tx);
2163 fiy3 = _mm_add_ps(fiy3,ty);
2164 fiz3 = _mm_add_ps(fiz3,tz);
2166 fjx2 = _mm_add_ps(fjx2,tx);
2167 fjy2 = _mm_add_ps(fjy2,ty);
2168 fjz2 = _mm_add_ps(fjz2,tz);
2172 /**************************
2173 * CALCULATE INTERACTIONS *
2174 **************************/
2176 if (gmx_mm_any_lt(rsq33,rcutoff2))
2179 r33 = _mm_mul_ps(rsq33,rinv33);
2181 /* EWALD ELECTROSTATICS */
2183 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2184 ewrt = _mm_mul_ps(r33,ewtabscale);
2185 ewitab = _mm_cvttps_epi32(ewrt);
2186 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2187 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2188 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2190 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2191 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2193 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2197 fscal = _mm_and_ps(fscal,cutoff_mask);
2199 /* Calculate temporary vectorial force */
2200 tx = _mm_mul_ps(fscal,dx33);
2201 ty = _mm_mul_ps(fscal,dy33);
2202 tz = _mm_mul_ps(fscal,dz33);
2204 /* Update vectorial force */
2205 fix3 = _mm_add_ps(fix3,tx);
2206 fiy3 = _mm_add_ps(fiy3,ty);
2207 fiz3 = _mm_add_ps(fiz3,tz);
2209 fjx3 = _mm_add_ps(fjx3,tx);
2210 fjy3 = _mm_add_ps(fjy3,ty);
2211 fjz3 = _mm_add_ps(fjz3,tz);
2215 fjptrA = f+j_coord_offsetA;
2216 fjptrB = f+j_coord_offsetB;
2217 fjptrC = f+j_coord_offsetC;
2218 fjptrD = f+j_coord_offsetD;
2220 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2221 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2222 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2224 /* Inner loop uses 384 flops */
2227 if(jidx<j_index_end)
2230 /* Get j neighbor index, and coordinate index */
2231 jnrlistA = jjnr[jidx];
2232 jnrlistB = jjnr[jidx+1];
2233 jnrlistC = jjnr[jidx+2];
2234 jnrlistD = jjnr[jidx+3];
2235 /* Sign of each element will be negative for non-real atoms.
2236 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2237 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2239 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2240 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2241 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2242 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2243 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2244 j_coord_offsetA = DIM*jnrA;
2245 j_coord_offsetB = DIM*jnrB;
2246 j_coord_offsetC = DIM*jnrC;
2247 j_coord_offsetD = DIM*jnrD;
2249 /* load j atom coordinates */
2250 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2251 x+j_coord_offsetC,x+j_coord_offsetD,
2252 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2253 &jy2,&jz2,&jx3,&jy3,&jz3);
2255 /* Calculate displacement vector */
2256 dx00 = _mm_sub_ps(ix0,jx0);
2257 dy00 = _mm_sub_ps(iy0,jy0);
2258 dz00 = _mm_sub_ps(iz0,jz0);
2259 dx11 = _mm_sub_ps(ix1,jx1);
2260 dy11 = _mm_sub_ps(iy1,jy1);
2261 dz11 = _mm_sub_ps(iz1,jz1);
2262 dx12 = _mm_sub_ps(ix1,jx2);
2263 dy12 = _mm_sub_ps(iy1,jy2);
2264 dz12 = _mm_sub_ps(iz1,jz2);
2265 dx13 = _mm_sub_ps(ix1,jx3);
2266 dy13 = _mm_sub_ps(iy1,jy3);
2267 dz13 = _mm_sub_ps(iz1,jz3);
2268 dx21 = _mm_sub_ps(ix2,jx1);
2269 dy21 = _mm_sub_ps(iy2,jy1);
2270 dz21 = _mm_sub_ps(iz2,jz1);
2271 dx22 = _mm_sub_ps(ix2,jx2);
2272 dy22 = _mm_sub_ps(iy2,jy2);
2273 dz22 = _mm_sub_ps(iz2,jz2);
2274 dx23 = _mm_sub_ps(ix2,jx3);
2275 dy23 = _mm_sub_ps(iy2,jy3);
2276 dz23 = _mm_sub_ps(iz2,jz3);
2277 dx31 = _mm_sub_ps(ix3,jx1);
2278 dy31 = _mm_sub_ps(iy3,jy1);
2279 dz31 = _mm_sub_ps(iz3,jz1);
2280 dx32 = _mm_sub_ps(ix3,jx2);
2281 dy32 = _mm_sub_ps(iy3,jy2);
2282 dz32 = _mm_sub_ps(iz3,jz2);
2283 dx33 = _mm_sub_ps(ix3,jx3);
2284 dy33 = _mm_sub_ps(iy3,jy3);
2285 dz33 = _mm_sub_ps(iz3,jz3);
2287 /* Calculate squared distance and things based on it */
2288 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2289 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2290 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2291 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2292 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2293 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2294 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2295 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2296 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2297 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2299 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2300 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2301 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2302 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2303 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2304 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2305 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2306 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2307 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2309 rinvsq00 = gmx_mm_inv_ps(rsq00);
2310 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2311 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2312 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2313 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2314 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2315 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2316 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2317 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2318 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2320 fjx0 = _mm_setzero_ps();
2321 fjy0 = _mm_setzero_ps();
2322 fjz0 = _mm_setzero_ps();
2323 fjx1 = _mm_setzero_ps();
2324 fjy1 = _mm_setzero_ps();
2325 fjz1 = _mm_setzero_ps();
2326 fjx2 = _mm_setzero_ps();
2327 fjy2 = _mm_setzero_ps();
2328 fjz2 = _mm_setzero_ps();
2329 fjx3 = _mm_setzero_ps();
2330 fjy3 = _mm_setzero_ps();
2331 fjz3 = _mm_setzero_ps();
2333 /**************************
2334 * CALCULATE INTERACTIONS *
2335 **************************/
2337 if (gmx_mm_any_lt(rsq00,rcutoff2))
2340 /* LENNARD-JONES DISPERSION/REPULSION */
2342 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2343 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2345 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2349 fscal = _mm_and_ps(fscal,cutoff_mask);
2351 fscal = _mm_andnot_ps(dummy_mask,fscal);
2353 /* Calculate temporary vectorial force */
2354 tx = _mm_mul_ps(fscal,dx00);
2355 ty = _mm_mul_ps(fscal,dy00);
2356 tz = _mm_mul_ps(fscal,dz00);
2358 /* Update vectorial force */
2359 fix0 = _mm_add_ps(fix0,tx);
2360 fiy0 = _mm_add_ps(fiy0,ty);
2361 fiz0 = _mm_add_ps(fiz0,tz);
2363 fjx0 = _mm_add_ps(fjx0,tx);
2364 fjy0 = _mm_add_ps(fjy0,ty);
2365 fjz0 = _mm_add_ps(fjz0,tz);
2369 /**************************
2370 * CALCULATE INTERACTIONS *
2371 **************************/
2373 if (gmx_mm_any_lt(rsq11,rcutoff2))
2376 r11 = _mm_mul_ps(rsq11,rinv11);
2377 r11 = _mm_andnot_ps(dummy_mask,r11);
2379 /* EWALD ELECTROSTATICS */
2381 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2382 ewrt = _mm_mul_ps(r11,ewtabscale);
2383 ewitab = _mm_cvttps_epi32(ewrt);
2384 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2385 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2386 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2388 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2389 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2391 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2395 fscal = _mm_and_ps(fscal,cutoff_mask);
2397 fscal = _mm_andnot_ps(dummy_mask,fscal);
2399 /* Calculate temporary vectorial force */
2400 tx = _mm_mul_ps(fscal,dx11);
2401 ty = _mm_mul_ps(fscal,dy11);
2402 tz = _mm_mul_ps(fscal,dz11);
2404 /* Update vectorial force */
2405 fix1 = _mm_add_ps(fix1,tx);
2406 fiy1 = _mm_add_ps(fiy1,ty);
2407 fiz1 = _mm_add_ps(fiz1,tz);
2409 fjx1 = _mm_add_ps(fjx1,tx);
2410 fjy1 = _mm_add_ps(fjy1,ty);
2411 fjz1 = _mm_add_ps(fjz1,tz);
2415 /**************************
2416 * CALCULATE INTERACTIONS *
2417 **************************/
2419 if (gmx_mm_any_lt(rsq12,rcutoff2))
2422 r12 = _mm_mul_ps(rsq12,rinv12);
2423 r12 = _mm_andnot_ps(dummy_mask,r12);
2425 /* EWALD ELECTROSTATICS */
2427 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2428 ewrt = _mm_mul_ps(r12,ewtabscale);
2429 ewitab = _mm_cvttps_epi32(ewrt);
2430 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2431 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2432 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2434 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2435 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2437 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2441 fscal = _mm_and_ps(fscal,cutoff_mask);
2443 fscal = _mm_andnot_ps(dummy_mask,fscal);
2445 /* Calculate temporary vectorial force */
2446 tx = _mm_mul_ps(fscal,dx12);
2447 ty = _mm_mul_ps(fscal,dy12);
2448 tz = _mm_mul_ps(fscal,dz12);
2450 /* Update vectorial force */
2451 fix1 = _mm_add_ps(fix1,tx);
2452 fiy1 = _mm_add_ps(fiy1,ty);
2453 fiz1 = _mm_add_ps(fiz1,tz);
2455 fjx2 = _mm_add_ps(fjx2,tx);
2456 fjy2 = _mm_add_ps(fjy2,ty);
2457 fjz2 = _mm_add_ps(fjz2,tz);
2461 /**************************
2462 * CALCULATE INTERACTIONS *
2463 **************************/
2465 if (gmx_mm_any_lt(rsq13,rcutoff2))
2468 r13 = _mm_mul_ps(rsq13,rinv13);
2469 r13 = _mm_andnot_ps(dummy_mask,r13);
2471 /* EWALD ELECTROSTATICS */
2473 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2474 ewrt = _mm_mul_ps(r13,ewtabscale);
2475 ewitab = _mm_cvttps_epi32(ewrt);
2476 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2477 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2478 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2480 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2481 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2483 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2487 fscal = _mm_and_ps(fscal,cutoff_mask);
2489 fscal = _mm_andnot_ps(dummy_mask,fscal);
2491 /* Calculate temporary vectorial force */
2492 tx = _mm_mul_ps(fscal,dx13);
2493 ty = _mm_mul_ps(fscal,dy13);
2494 tz = _mm_mul_ps(fscal,dz13);
2496 /* Update vectorial force */
2497 fix1 = _mm_add_ps(fix1,tx);
2498 fiy1 = _mm_add_ps(fiy1,ty);
2499 fiz1 = _mm_add_ps(fiz1,tz);
2501 fjx3 = _mm_add_ps(fjx3,tx);
2502 fjy3 = _mm_add_ps(fjy3,ty);
2503 fjz3 = _mm_add_ps(fjz3,tz);
2507 /**************************
2508 * CALCULATE INTERACTIONS *
2509 **************************/
2511 if (gmx_mm_any_lt(rsq21,rcutoff2))
2514 r21 = _mm_mul_ps(rsq21,rinv21);
2515 r21 = _mm_andnot_ps(dummy_mask,r21);
2517 /* EWALD ELECTROSTATICS */
2519 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2520 ewrt = _mm_mul_ps(r21,ewtabscale);
2521 ewitab = _mm_cvttps_epi32(ewrt);
2522 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2523 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2524 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2526 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2527 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2529 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2533 fscal = _mm_and_ps(fscal,cutoff_mask);
2535 fscal = _mm_andnot_ps(dummy_mask,fscal);
2537 /* Calculate temporary vectorial force */
2538 tx = _mm_mul_ps(fscal,dx21);
2539 ty = _mm_mul_ps(fscal,dy21);
2540 tz = _mm_mul_ps(fscal,dz21);
2542 /* Update vectorial force */
2543 fix2 = _mm_add_ps(fix2,tx);
2544 fiy2 = _mm_add_ps(fiy2,ty);
2545 fiz2 = _mm_add_ps(fiz2,tz);
2547 fjx1 = _mm_add_ps(fjx1,tx);
2548 fjy1 = _mm_add_ps(fjy1,ty);
2549 fjz1 = _mm_add_ps(fjz1,tz);
2553 /**************************
2554 * CALCULATE INTERACTIONS *
2555 **************************/
2557 if (gmx_mm_any_lt(rsq22,rcutoff2))
2560 r22 = _mm_mul_ps(rsq22,rinv22);
2561 r22 = _mm_andnot_ps(dummy_mask,r22);
2563 /* EWALD ELECTROSTATICS */
2565 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2566 ewrt = _mm_mul_ps(r22,ewtabscale);
2567 ewitab = _mm_cvttps_epi32(ewrt);
2568 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2569 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2570 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2572 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2573 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2575 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2579 fscal = _mm_and_ps(fscal,cutoff_mask);
2581 fscal = _mm_andnot_ps(dummy_mask,fscal);
2583 /* Calculate temporary vectorial force */
2584 tx = _mm_mul_ps(fscal,dx22);
2585 ty = _mm_mul_ps(fscal,dy22);
2586 tz = _mm_mul_ps(fscal,dz22);
2588 /* Update vectorial force */
2589 fix2 = _mm_add_ps(fix2,tx);
2590 fiy2 = _mm_add_ps(fiy2,ty);
2591 fiz2 = _mm_add_ps(fiz2,tz);
2593 fjx2 = _mm_add_ps(fjx2,tx);
2594 fjy2 = _mm_add_ps(fjy2,ty);
2595 fjz2 = _mm_add_ps(fjz2,tz);
2599 /**************************
2600 * CALCULATE INTERACTIONS *
2601 **************************/
2603 if (gmx_mm_any_lt(rsq23,rcutoff2))
2606 r23 = _mm_mul_ps(rsq23,rinv23);
2607 r23 = _mm_andnot_ps(dummy_mask,r23);
2609 /* EWALD ELECTROSTATICS */
2611 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2612 ewrt = _mm_mul_ps(r23,ewtabscale);
2613 ewitab = _mm_cvttps_epi32(ewrt);
2614 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2615 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2616 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2618 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2619 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2621 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2625 fscal = _mm_and_ps(fscal,cutoff_mask);
2627 fscal = _mm_andnot_ps(dummy_mask,fscal);
2629 /* Calculate temporary vectorial force */
2630 tx = _mm_mul_ps(fscal,dx23);
2631 ty = _mm_mul_ps(fscal,dy23);
2632 tz = _mm_mul_ps(fscal,dz23);
2634 /* Update vectorial force */
2635 fix2 = _mm_add_ps(fix2,tx);
2636 fiy2 = _mm_add_ps(fiy2,ty);
2637 fiz2 = _mm_add_ps(fiz2,tz);
2639 fjx3 = _mm_add_ps(fjx3,tx);
2640 fjy3 = _mm_add_ps(fjy3,ty);
2641 fjz3 = _mm_add_ps(fjz3,tz);
2645 /**************************
2646 * CALCULATE INTERACTIONS *
2647 **************************/
2649 if (gmx_mm_any_lt(rsq31,rcutoff2))
2652 r31 = _mm_mul_ps(rsq31,rinv31);
2653 r31 = _mm_andnot_ps(dummy_mask,r31);
2655 /* EWALD ELECTROSTATICS */
2657 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2658 ewrt = _mm_mul_ps(r31,ewtabscale);
2659 ewitab = _mm_cvttps_epi32(ewrt);
2660 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2661 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2662 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2664 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2665 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2667 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2671 fscal = _mm_and_ps(fscal,cutoff_mask);
2673 fscal = _mm_andnot_ps(dummy_mask,fscal);
2675 /* Calculate temporary vectorial force */
2676 tx = _mm_mul_ps(fscal,dx31);
2677 ty = _mm_mul_ps(fscal,dy31);
2678 tz = _mm_mul_ps(fscal,dz31);
2680 /* Update vectorial force */
2681 fix3 = _mm_add_ps(fix3,tx);
2682 fiy3 = _mm_add_ps(fiy3,ty);
2683 fiz3 = _mm_add_ps(fiz3,tz);
2685 fjx1 = _mm_add_ps(fjx1,tx);
2686 fjy1 = _mm_add_ps(fjy1,ty);
2687 fjz1 = _mm_add_ps(fjz1,tz);
2691 /**************************
2692 * CALCULATE INTERACTIONS *
2693 **************************/
2695 if (gmx_mm_any_lt(rsq32,rcutoff2))
2698 r32 = _mm_mul_ps(rsq32,rinv32);
2699 r32 = _mm_andnot_ps(dummy_mask,r32);
2701 /* EWALD ELECTROSTATICS */
2703 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2704 ewrt = _mm_mul_ps(r32,ewtabscale);
2705 ewitab = _mm_cvttps_epi32(ewrt);
2706 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2707 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2708 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2710 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2711 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2713 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2717 fscal = _mm_and_ps(fscal,cutoff_mask);
2719 fscal = _mm_andnot_ps(dummy_mask,fscal);
2721 /* Calculate temporary vectorial force */
2722 tx = _mm_mul_ps(fscal,dx32);
2723 ty = _mm_mul_ps(fscal,dy32);
2724 tz = _mm_mul_ps(fscal,dz32);
2726 /* Update vectorial force */
2727 fix3 = _mm_add_ps(fix3,tx);
2728 fiy3 = _mm_add_ps(fiy3,ty);
2729 fiz3 = _mm_add_ps(fiz3,tz);
2731 fjx2 = _mm_add_ps(fjx2,tx);
2732 fjy2 = _mm_add_ps(fjy2,ty);
2733 fjz2 = _mm_add_ps(fjz2,tz);
2737 /**************************
2738 * CALCULATE INTERACTIONS *
2739 **************************/
2741 if (gmx_mm_any_lt(rsq33,rcutoff2))
2744 r33 = _mm_mul_ps(rsq33,rinv33);
2745 r33 = _mm_andnot_ps(dummy_mask,r33);
2747 /* EWALD ELECTROSTATICS */
2749 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2750 ewrt = _mm_mul_ps(r33,ewtabscale);
2751 ewitab = _mm_cvttps_epi32(ewrt);
2752 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2753 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2754 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2756 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2757 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2759 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2763 fscal = _mm_and_ps(fscal,cutoff_mask);
2765 fscal = _mm_andnot_ps(dummy_mask,fscal);
2767 /* Calculate temporary vectorial force */
2768 tx = _mm_mul_ps(fscal,dx33);
2769 ty = _mm_mul_ps(fscal,dy33);
2770 tz = _mm_mul_ps(fscal,dz33);
2772 /* Update vectorial force */
2773 fix3 = _mm_add_ps(fix3,tx);
2774 fiy3 = _mm_add_ps(fiy3,ty);
2775 fiz3 = _mm_add_ps(fiz3,tz);
2777 fjx3 = _mm_add_ps(fjx3,tx);
2778 fjy3 = _mm_add_ps(fjy3,ty);
2779 fjz3 = _mm_add_ps(fjz3,tz);
2783 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2784 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2785 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2786 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2788 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2789 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2790 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2792 /* Inner loop uses 393 flops */
2795 /* End of innermost loop */
2797 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2798 f+i_coord_offset,fshift+i_shift_offset);
2800 /* Increment number of inner iterations */
2801 inneriter += j_index_end - j_index_start;
2803 /* Outer loop uses 24 flops */
2806 /* Increment number of outer iterations */
2809 /* Update outer/inner flops */
2811 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*393);