2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_sse4_1_single.h"
48 #include "kernelutil_x86_sse4_1_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_VF_sse4_1_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
116 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
127 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
129 __m128 one_half = _mm_set1_ps(0.5);
130 __m128 minus_one = _mm_set1_ps(-1.0);
132 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
134 __m128 dummy_mask,cutoff_mask;
135 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
136 __m128 one = _mm_set1_ps(1.0);
137 __m128 two = _mm_set1_ps(2.0);
143 jindex = nlist->jindex;
145 shiftidx = nlist->shift;
147 shiftvec = fr->shift_vec[0];
148 fshift = fr->fshift[0];
149 facel = _mm_set1_ps(fr->epsfac);
150 charge = mdatoms->chargeA;
151 nvdwtype = fr->ntype;
153 vdwtype = mdatoms->typeA;
154 vdwgridparam = fr->ljpme_c6grid;
155 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
156 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
157 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
159 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
160 ewtab = fr->ic->tabq_coul_FDV0;
161 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
162 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
164 /* Setup water-specific parameters */
165 inr = nlist->iinr[0];
166 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
167 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
168 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
169 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
171 jq1 = _mm_set1_ps(charge[inr+1]);
172 jq2 = _mm_set1_ps(charge[inr+2]);
173 jq3 = _mm_set1_ps(charge[inr+3]);
174 vdwjidx0A = 2*vdwtype[inr+0];
175 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
176 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
177 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
178 qq11 = _mm_mul_ps(iq1,jq1);
179 qq12 = _mm_mul_ps(iq1,jq2);
180 qq13 = _mm_mul_ps(iq1,jq3);
181 qq21 = _mm_mul_ps(iq2,jq1);
182 qq22 = _mm_mul_ps(iq2,jq2);
183 qq23 = _mm_mul_ps(iq2,jq3);
184 qq31 = _mm_mul_ps(iq3,jq1);
185 qq32 = _mm_mul_ps(iq3,jq2);
186 qq33 = _mm_mul_ps(iq3,jq3);
188 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
189 rcutoff_scalar = fr->rcoulomb;
190 rcutoff = _mm_set1_ps(rcutoff_scalar);
191 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
193 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
194 rvdw = _mm_set1_ps(fr->rvdw);
196 /* Avoid stupid compiler warnings */
197 jnrA = jnrB = jnrC = jnrD = 0;
206 for(iidx=0;iidx<4*DIM;iidx++)
211 /* Start outer loop over neighborlists */
212 for(iidx=0; iidx<nri; iidx++)
214 /* Load shift vector for this list */
215 i_shift_offset = DIM*shiftidx[iidx];
217 /* Load limits for loop over neighbors */
218 j_index_start = jindex[iidx];
219 j_index_end = jindex[iidx+1];
221 /* Get outer coordinate index */
223 i_coord_offset = DIM*inr;
225 /* Load i particle coords and add shift vector */
226 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
227 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
229 fix0 = _mm_setzero_ps();
230 fiy0 = _mm_setzero_ps();
231 fiz0 = _mm_setzero_ps();
232 fix1 = _mm_setzero_ps();
233 fiy1 = _mm_setzero_ps();
234 fiz1 = _mm_setzero_ps();
235 fix2 = _mm_setzero_ps();
236 fiy2 = _mm_setzero_ps();
237 fiz2 = _mm_setzero_ps();
238 fix3 = _mm_setzero_ps();
239 fiy3 = _mm_setzero_ps();
240 fiz3 = _mm_setzero_ps();
242 /* Reset potential sums */
243 velecsum = _mm_setzero_ps();
244 vvdwsum = _mm_setzero_ps();
246 /* Start inner kernel loop */
247 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
250 /* Get j neighbor index, and coordinate index */
255 j_coord_offsetA = DIM*jnrA;
256 j_coord_offsetB = DIM*jnrB;
257 j_coord_offsetC = DIM*jnrC;
258 j_coord_offsetD = DIM*jnrD;
260 /* load j atom coordinates */
261 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
262 x+j_coord_offsetC,x+j_coord_offsetD,
263 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
264 &jy2,&jz2,&jx3,&jy3,&jz3);
266 /* Calculate displacement vector */
267 dx00 = _mm_sub_ps(ix0,jx0);
268 dy00 = _mm_sub_ps(iy0,jy0);
269 dz00 = _mm_sub_ps(iz0,jz0);
270 dx11 = _mm_sub_ps(ix1,jx1);
271 dy11 = _mm_sub_ps(iy1,jy1);
272 dz11 = _mm_sub_ps(iz1,jz1);
273 dx12 = _mm_sub_ps(ix1,jx2);
274 dy12 = _mm_sub_ps(iy1,jy2);
275 dz12 = _mm_sub_ps(iz1,jz2);
276 dx13 = _mm_sub_ps(ix1,jx3);
277 dy13 = _mm_sub_ps(iy1,jy3);
278 dz13 = _mm_sub_ps(iz1,jz3);
279 dx21 = _mm_sub_ps(ix2,jx1);
280 dy21 = _mm_sub_ps(iy2,jy1);
281 dz21 = _mm_sub_ps(iz2,jz1);
282 dx22 = _mm_sub_ps(ix2,jx2);
283 dy22 = _mm_sub_ps(iy2,jy2);
284 dz22 = _mm_sub_ps(iz2,jz2);
285 dx23 = _mm_sub_ps(ix2,jx3);
286 dy23 = _mm_sub_ps(iy2,jy3);
287 dz23 = _mm_sub_ps(iz2,jz3);
288 dx31 = _mm_sub_ps(ix3,jx1);
289 dy31 = _mm_sub_ps(iy3,jy1);
290 dz31 = _mm_sub_ps(iz3,jz1);
291 dx32 = _mm_sub_ps(ix3,jx2);
292 dy32 = _mm_sub_ps(iy3,jy2);
293 dz32 = _mm_sub_ps(iz3,jz2);
294 dx33 = _mm_sub_ps(ix3,jx3);
295 dy33 = _mm_sub_ps(iy3,jy3);
296 dz33 = _mm_sub_ps(iz3,jz3);
298 /* Calculate squared distance and things based on it */
299 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
300 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
301 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
302 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
303 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
304 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
305 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
306 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
307 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
308 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
310 rinv00 = gmx_mm_invsqrt_ps(rsq00);
311 rinv11 = gmx_mm_invsqrt_ps(rsq11);
312 rinv12 = gmx_mm_invsqrt_ps(rsq12);
313 rinv13 = gmx_mm_invsqrt_ps(rsq13);
314 rinv21 = gmx_mm_invsqrt_ps(rsq21);
315 rinv22 = gmx_mm_invsqrt_ps(rsq22);
316 rinv23 = gmx_mm_invsqrt_ps(rsq23);
317 rinv31 = gmx_mm_invsqrt_ps(rsq31);
318 rinv32 = gmx_mm_invsqrt_ps(rsq32);
319 rinv33 = gmx_mm_invsqrt_ps(rsq33);
321 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
322 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
323 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
324 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
325 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
326 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
327 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
328 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
329 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
330 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
332 fjx0 = _mm_setzero_ps();
333 fjy0 = _mm_setzero_ps();
334 fjz0 = _mm_setzero_ps();
335 fjx1 = _mm_setzero_ps();
336 fjy1 = _mm_setzero_ps();
337 fjz1 = _mm_setzero_ps();
338 fjx2 = _mm_setzero_ps();
339 fjy2 = _mm_setzero_ps();
340 fjz2 = _mm_setzero_ps();
341 fjx3 = _mm_setzero_ps();
342 fjy3 = _mm_setzero_ps();
343 fjz3 = _mm_setzero_ps();
345 /**************************
346 * CALCULATE INTERACTIONS *
347 **************************/
349 if (gmx_mm_any_lt(rsq00,rcutoff2))
352 r00 = _mm_mul_ps(rsq00,rinv00);
354 /* Analytical LJ-PME */
355 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
356 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
357 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
358 exponent = gmx_simd_exp_r(ewcljrsq);
359 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
360 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
361 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
362 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
363 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
364 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
365 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
366 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
367 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
369 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
371 /* Update potential sum for this i atom from the interaction with this j atom. */
372 vvdw = _mm_and_ps(vvdw,cutoff_mask);
373 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
377 fscal = _mm_and_ps(fscal,cutoff_mask);
379 /* Calculate temporary vectorial force */
380 tx = _mm_mul_ps(fscal,dx00);
381 ty = _mm_mul_ps(fscal,dy00);
382 tz = _mm_mul_ps(fscal,dz00);
384 /* Update vectorial force */
385 fix0 = _mm_add_ps(fix0,tx);
386 fiy0 = _mm_add_ps(fiy0,ty);
387 fiz0 = _mm_add_ps(fiz0,tz);
389 fjx0 = _mm_add_ps(fjx0,tx);
390 fjy0 = _mm_add_ps(fjy0,ty);
391 fjz0 = _mm_add_ps(fjz0,tz);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
399 if (gmx_mm_any_lt(rsq11,rcutoff2))
402 r11 = _mm_mul_ps(rsq11,rinv11);
404 /* EWALD ELECTROSTATICS */
406 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
407 ewrt = _mm_mul_ps(r11,ewtabscale);
408 ewitab = _mm_cvttps_epi32(ewrt);
409 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
410 ewitab = _mm_slli_epi32(ewitab,2);
411 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
412 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
413 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
414 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
415 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
416 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
417 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
418 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
419 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
421 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velec = _mm_and_ps(velec,cutoff_mask);
425 velecsum = _mm_add_ps(velecsum,velec);
429 fscal = _mm_and_ps(fscal,cutoff_mask);
431 /* Calculate temporary vectorial force */
432 tx = _mm_mul_ps(fscal,dx11);
433 ty = _mm_mul_ps(fscal,dy11);
434 tz = _mm_mul_ps(fscal,dz11);
436 /* Update vectorial force */
437 fix1 = _mm_add_ps(fix1,tx);
438 fiy1 = _mm_add_ps(fiy1,ty);
439 fiz1 = _mm_add_ps(fiz1,tz);
441 fjx1 = _mm_add_ps(fjx1,tx);
442 fjy1 = _mm_add_ps(fjy1,ty);
443 fjz1 = _mm_add_ps(fjz1,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 if (gmx_mm_any_lt(rsq12,rcutoff2))
454 r12 = _mm_mul_ps(rsq12,rinv12);
456 /* EWALD ELECTROSTATICS */
458 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
459 ewrt = _mm_mul_ps(r12,ewtabscale);
460 ewitab = _mm_cvttps_epi32(ewrt);
461 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
462 ewitab = _mm_slli_epi32(ewitab,2);
463 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
464 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
465 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
466 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
467 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
468 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
469 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
470 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
471 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
473 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
475 /* Update potential sum for this i atom from the interaction with this j atom. */
476 velec = _mm_and_ps(velec,cutoff_mask);
477 velecsum = _mm_add_ps(velecsum,velec);
481 fscal = _mm_and_ps(fscal,cutoff_mask);
483 /* Calculate temporary vectorial force */
484 tx = _mm_mul_ps(fscal,dx12);
485 ty = _mm_mul_ps(fscal,dy12);
486 tz = _mm_mul_ps(fscal,dz12);
488 /* Update vectorial force */
489 fix1 = _mm_add_ps(fix1,tx);
490 fiy1 = _mm_add_ps(fiy1,ty);
491 fiz1 = _mm_add_ps(fiz1,tz);
493 fjx2 = _mm_add_ps(fjx2,tx);
494 fjy2 = _mm_add_ps(fjy2,ty);
495 fjz2 = _mm_add_ps(fjz2,tz);
499 /**************************
500 * CALCULATE INTERACTIONS *
501 **************************/
503 if (gmx_mm_any_lt(rsq13,rcutoff2))
506 r13 = _mm_mul_ps(rsq13,rinv13);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm_mul_ps(r13,ewtabscale);
512 ewitab = _mm_cvttps_epi32(ewrt);
513 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
519 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
521 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
522 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
523 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
525 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velec = _mm_and_ps(velec,cutoff_mask);
529 velecsum = _mm_add_ps(velecsum,velec);
533 fscal = _mm_and_ps(fscal,cutoff_mask);
535 /* Calculate temporary vectorial force */
536 tx = _mm_mul_ps(fscal,dx13);
537 ty = _mm_mul_ps(fscal,dy13);
538 tz = _mm_mul_ps(fscal,dz13);
540 /* Update vectorial force */
541 fix1 = _mm_add_ps(fix1,tx);
542 fiy1 = _mm_add_ps(fiy1,ty);
543 fiz1 = _mm_add_ps(fiz1,tz);
545 fjx3 = _mm_add_ps(fjx3,tx);
546 fjy3 = _mm_add_ps(fjy3,ty);
547 fjz3 = _mm_add_ps(fjz3,tz);
551 /**************************
552 * CALCULATE INTERACTIONS *
553 **************************/
555 if (gmx_mm_any_lt(rsq21,rcutoff2))
558 r21 = _mm_mul_ps(rsq21,rinv21);
560 /* EWALD ELECTROSTATICS */
562 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563 ewrt = _mm_mul_ps(r21,ewtabscale);
564 ewitab = _mm_cvttps_epi32(ewrt);
565 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
566 ewitab = _mm_slli_epi32(ewitab,2);
567 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
568 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
569 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
570 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
571 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
572 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
573 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
574 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
575 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
577 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
579 /* Update potential sum for this i atom from the interaction with this j atom. */
580 velec = _mm_and_ps(velec,cutoff_mask);
581 velecsum = _mm_add_ps(velecsum,velec);
585 fscal = _mm_and_ps(fscal,cutoff_mask);
587 /* Calculate temporary vectorial force */
588 tx = _mm_mul_ps(fscal,dx21);
589 ty = _mm_mul_ps(fscal,dy21);
590 tz = _mm_mul_ps(fscal,dz21);
592 /* Update vectorial force */
593 fix2 = _mm_add_ps(fix2,tx);
594 fiy2 = _mm_add_ps(fiy2,ty);
595 fiz2 = _mm_add_ps(fiz2,tz);
597 fjx1 = _mm_add_ps(fjx1,tx);
598 fjy1 = _mm_add_ps(fjy1,ty);
599 fjz1 = _mm_add_ps(fjz1,tz);
603 /**************************
604 * CALCULATE INTERACTIONS *
605 **************************/
607 if (gmx_mm_any_lt(rsq22,rcutoff2))
610 r22 = _mm_mul_ps(rsq22,rinv22);
612 /* EWALD ELECTROSTATICS */
614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
615 ewrt = _mm_mul_ps(r22,ewtabscale);
616 ewitab = _mm_cvttps_epi32(ewrt);
617 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
618 ewitab = _mm_slli_epi32(ewitab,2);
619 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
620 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
621 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
622 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
623 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
624 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
625 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
626 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
627 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
629 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
631 /* Update potential sum for this i atom from the interaction with this j atom. */
632 velec = _mm_and_ps(velec,cutoff_mask);
633 velecsum = _mm_add_ps(velecsum,velec);
637 fscal = _mm_and_ps(fscal,cutoff_mask);
639 /* Calculate temporary vectorial force */
640 tx = _mm_mul_ps(fscal,dx22);
641 ty = _mm_mul_ps(fscal,dy22);
642 tz = _mm_mul_ps(fscal,dz22);
644 /* Update vectorial force */
645 fix2 = _mm_add_ps(fix2,tx);
646 fiy2 = _mm_add_ps(fiy2,ty);
647 fiz2 = _mm_add_ps(fiz2,tz);
649 fjx2 = _mm_add_ps(fjx2,tx);
650 fjy2 = _mm_add_ps(fjy2,ty);
651 fjz2 = _mm_add_ps(fjz2,tz);
655 /**************************
656 * CALCULATE INTERACTIONS *
657 **************************/
659 if (gmx_mm_any_lt(rsq23,rcutoff2))
662 r23 = _mm_mul_ps(rsq23,rinv23);
664 /* EWALD ELECTROSTATICS */
666 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
667 ewrt = _mm_mul_ps(r23,ewtabscale);
668 ewitab = _mm_cvttps_epi32(ewrt);
669 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
670 ewitab = _mm_slli_epi32(ewitab,2);
671 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
672 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
673 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
674 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
675 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
676 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
677 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
678 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
679 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
681 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
683 /* Update potential sum for this i atom from the interaction with this j atom. */
684 velec = _mm_and_ps(velec,cutoff_mask);
685 velecsum = _mm_add_ps(velecsum,velec);
689 fscal = _mm_and_ps(fscal,cutoff_mask);
691 /* Calculate temporary vectorial force */
692 tx = _mm_mul_ps(fscal,dx23);
693 ty = _mm_mul_ps(fscal,dy23);
694 tz = _mm_mul_ps(fscal,dz23);
696 /* Update vectorial force */
697 fix2 = _mm_add_ps(fix2,tx);
698 fiy2 = _mm_add_ps(fiy2,ty);
699 fiz2 = _mm_add_ps(fiz2,tz);
701 fjx3 = _mm_add_ps(fjx3,tx);
702 fjy3 = _mm_add_ps(fjy3,ty);
703 fjz3 = _mm_add_ps(fjz3,tz);
707 /**************************
708 * CALCULATE INTERACTIONS *
709 **************************/
711 if (gmx_mm_any_lt(rsq31,rcutoff2))
714 r31 = _mm_mul_ps(rsq31,rinv31);
716 /* EWALD ELECTROSTATICS */
718 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
719 ewrt = _mm_mul_ps(r31,ewtabscale);
720 ewitab = _mm_cvttps_epi32(ewrt);
721 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
722 ewitab = _mm_slli_epi32(ewitab,2);
723 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
724 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
725 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
726 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
727 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
728 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
729 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
730 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
731 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
733 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velec = _mm_and_ps(velec,cutoff_mask);
737 velecsum = _mm_add_ps(velecsum,velec);
741 fscal = _mm_and_ps(fscal,cutoff_mask);
743 /* Calculate temporary vectorial force */
744 tx = _mm_mul_ps(fscal,dx31);
745 ty = _mm_mul_ps(fscal,dy31);
746 tz = _mm_mul_ps(fscal,dz31);
748 /* Update vectorial force */
749 fix3 = _mm_add_ps(fix3,tx);
750 fiy3 = _mm_add_ps(fiy3,ty);
751 fiz3 = _mm_add_ps(fiz3,tz);
753 fjx1 = _mm_add_ps(fjx1,tx);
754 fjy1 = _mm_add_ps(fjy1,ty);
755 fjz1 = _mm_add_ps(fjz1,tz);
759 /**************************
760 * CALCULATE INTERACTIONS *
761 **************************/
763 if (gmx_mm_any_lt(rsq32,rcutoff2))
766 r32 = _mm_mul_ps(rsq32,rinv32);
768 /* EWALD ELECTROSTATICS */
770 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
771 ewrt = _mm_mul_ps(r32,ewtabscale);
772 ewitab = _mm_cvttps_epi32(ewrt);
773 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
774 ewitab = _mm_slli_epi32(ewitab,2);
775 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
776 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
777 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
778 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
779 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
780 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
781 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
782 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
783 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
785 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
787 /* Update potential sum for this i atom from the interaction with this j atom. */
788 velec = _mm_and_ps(velec,cutoff_mask);
789 velecsum = _mm_add_ps(velecsum,velec);
793 fscal = _mm_and_ps(fscal,cutoff_mask);
795 /* Calculate temporary vectorial force */
796 tx = _mm_mul_ps(fscal,dx32);
797 ty = _mm_mul_ps(fscal,dy32);
798 tz = _mm_mul_ps(fscal,dz32);
800 /* Update vectorial force */
801 fix3 = _mm_add_ps(fix3,tx);
802 fiy3 = _mm_add_ps(fiy3,ty);
803 fiz3 = _mm_add_ps(fiz3,tz);
805 fjx2 = _mm_add_ps(fjx2,tx);
806 fjy2 = _mm_add_ps(fjy2,ty);
807 fjz2 = _mm_add_ps(fjz2,tz);
811 /**************************
812 * CALCULATE INTERACTIONS *
813 **************************/
815 if (gmx_mm_any_lt(rsq33,rcutoff2))
818 r33 = _mm_mul_ps(rsq33,rinv33);
820 /* EWALD ELECTROSTATICS */
822 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
823 ewrt = _mm_mul_ps(r33,ewtabscale);
824 ewitab = _mm_cvttps_epi32(ewrt);
825 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
826 ewitab = _mm_slli_epi32(ewitab,2);
827 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
828 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
829 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
830 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
831 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
832 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
833 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
834 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
835 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
837 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
839 /* Update potential sum for this i atom from the interaction with this j atom. */
840 velec = _mm_and_ps(velec,cutoff_mask);
841 velecsum = _mm_add_ps(velecsum,velec);
845 fscal = _mm_and_ps(fscal,cutoff_mask);
847 /* Calculate temporary vectorial force */
848 tx = _mm_mul_ps(fscal,dx33);
849 ty = _mm_mul_ps(fscal,dy33);
850 tz = _mm_mul_ps(fscal,dz33);
852 /* Update vectorial force */
853 fix3 = _mm_add_ps(fix3,tx);
854 fiy3 = _mm_add_ps(fiy3,ty);
855 fiz3 = _mm_add_ps(fiz3,tz);
857 fjx3 = _mm_add_ps(fjx3,tx);
858 fjy3 = _mm_add_ps(fjy3,ty);
859 fjz3 = _mm_add_ps(fjz3,tz);
863 fjptrA = f+j_coord_offsetA;
864 fjptrB = f+j_coord_offsetB;
865 fjptrC = f+j_coord_offsetC;
866 fjptrD = f+j_coord_offsetD;
868 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
869 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
870 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
872 /* Inner loop uses 479 flops */
878 /* Get j neighbor index, and coordinate index */
879 jnrlistA = jjnr[jidx];
880 jnrlistB = jjnr[jidx+1];
881 jnrlistC = jjnr[jidx+2];
882 jnrlistD = jjnr[jidx+3];
883 /* Sign of each element will be negative for non-real atoms.
884 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
885 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
887 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
888 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
889 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
890 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
891 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
892 j_coord_offsetA = DIM*jnrA;
893 j_coord_offsetB = DIM*jnrB;
894 j_coord_offsetC = DIM*jnrC;
895 j_coord_offsetD = DIM*jnrD;
897 /* load j atom coordinates */
898 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
899 x+j_coord_offsetC,x+j_coord_offsetD,
900 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
901 &jy2,&jz2,&jx3,&jy3,&jz3);
903 /* Calculate displacement vector */
904 dx00 = _mm_sub_ps(ix0,jx0);
905 dy00 = _mm_sub_ps(iy0,jy0);
906 dz00 = _mm_sub_ps(iz0,jz0);
907 dx11 = _mm_sub_ps(ix1,jx1);
908 dy11 = _mm_sub_ps(iy1,jy1);
909 dz11 = _mm_sub_ps(iz1,jz1);
910 dx12 = _mm_sub_ps(ix1,jx2);
911 dy12 = _mm_sub_ps(iy1,jy2);
912 dz12 = _mm_sub_ps(iz1,jz2);
913 dx13 = _mm_sub_ps(ix1,jx3);
914 dy13 = _mm_sub_ps(iy1,jy3);
915 dz13 = _mm_sub_ps(iz1,jz3);
916 dx21 = _mm_sub_ps(ix2,jx1);
917 dy21 = _mm_sub_ps(iy2,jy1);
918 dz21 = _mm_sub_ps(iz2,jz1);
919 dx22 = _mm_sub_ps(ix2,jx2);
920 dy22 = _mm_sub_ps(iy2,jy2);
921 dz22 = _mm_sub_ps(iz2,jz2);
922 dx23 = _mm_sub_ps(ix2,jx3);
923 dy23 = _mm_sub_ps(iy2,jy3);
924 dz23 = _mm_sub_ps(iz2,jz3);
925 dx31 = _mm_sub_ps(ix3,jx1);
926 dy31 = _mm_sub_ps(iy3,jy1);
927 dz31 = _mm_sub_ps(iz3,jz1);
928 dx32 = _mm_sub_ps(ix3,jx2);
929 dy32 = _mm_sub_ps(iy3,jy2);
930 dz32 = _mm_sub_ps(iz3,jz2);
931 dx33 = _mm_sub_ps(ix3,jx3);
932 dy33 = _mm_sub_ps(iy3,jy3);
933 dz33 = _mm_sub_ps(iz3,jz3);
935 /* Calculate squared distance and things based on it */
936 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
937 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
938 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
939 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
940 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
941 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
942 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
943 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
944 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
945 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
947 rinv00 = gmx_mm_invsqrt_ps(rsq00);
948 rinv11 = gmx_mm_invsqrt_ps(rsq11);
949 rinv12 = gmx_mm_invsqrt_ps(rsq12);
950 rinv13 = gmx_mm_invsqrt_ps(rsq13);
951 rinv21 = gmx_mm_invsqrt_ps(rsq21);
952 rinv22 = gmx_mm_invsqrt_ps(rsq22);
953 rinv23 = gmx_mm_invsqrt_ps(rsq23);
954 rinv31 = gmx_mm_invsqrt_ps(rsq31);
955 rinv32 = gmx_mm_invsqrt_ps(rsq32);
956 rinv33 = gmx_mm_invsqrt_ps(rsq33);
958 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
959 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
960 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
961 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
962 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
963 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
964 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
965 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
966 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
967 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
969 fjx0 = _mm_setzero_ps();
970 fjy0 = _mm_setzero_ps();
971 fjz0 = _mm_setzero_ps();
972 fjx1 = _mm_setzero_ps();
973 fjy1 = _mm_setzero_ps();
974 fjz1 = _mm_setzero_ps();
975 fjx2 = _mm_setzero_ps();
976 fjy2 = _mm_setzero_ps();
977 fjz2 = _mm_setzero_ps();
978 fjx3 = _mm_setzero_ps();
979 fjy3 = _mm_setzero_ps();
980 fjz3 = _mm_setzero_ps();
982 /**************************
983 * CALCULATE INTERACTIONS *
984 **************************/
986 if (gmx_mm_any_lt(rsq00,rcutoff2))
989 r00 = _mm_mul_ps(rsq00,rinv00);
990 r00 = _mm_andnot_ps(dummy_mask,r00);
992 /* Analytical LJ-PME */
993 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
994 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
995 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
996 exponent = gmx_simd_exp_r(ewcljrsq);
997 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
998 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
999 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
1000 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
1001 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1002 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))),one_twelfth),
1003 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
1004 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
1005 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
1007 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1009 /* Update potential sum for this i atom from the interaction with this j atom. */
1010 vvdw = _mm_and_ps(vvdw,cutoff_mask);
1011 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
1012 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
1016 fscal = _mm_and_ps(fscal,cutoff_mask);
1018 fscal = _mm_andnot_ps(dummy_mask,fscal);
1020 /* Calculate temporary vectorial force */
1021 tx = _mm_mul_ps(fscal,dx00);
1022 ty = _mm_mul_ps(fscal,dy00);
1023 tz = _mm_mul_ps(fscal,dz00);
1025 /* Update vectorial force */
1026 fix0 = _mm_add_ps(fix0,tx);
1027 fiy0 = _mm_add_ps(fiy0,ty);
1028 fiz0 = _mm_add_ps(fiz0,tz);
1030 fjx0 = _mm_add_ps(fjx0,tx);
1031 fjy0 = _mm_add_ps(fjy0,ty);
1032 fjz0 = _mm_add_ps(fjz0,tz);
1036 /**************************
1037 * CALCULATE INTERACTIONS *
1038 **************************/
1040 if (gmx_mm_any_lt(rsq11,rcutoff2))
1043 r11 = _mm_mul_ps(rsq11,rinv11);
1044 r11 = _mm_andnot_ps(dummy_mask,r11);
1046 /* EWALD ELECTROSTATICS */
1048 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1049 ewrt = _mm_mul_ps(r11,ewtabscale);
1050 ewitab = _mm_cvttps_epi32(ewrt);
1051 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1052 ewitab = _mm_slli_epi32(ewitab,2);
1053 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1054 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1055 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1056 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1057 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1058 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1059 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1060 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1061 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1063 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec = _mm_and_ps(velec,cutoff_mask);
1067 velec = _mm_andnot_ps(dummy_mask,velec);
1068 velecsum = _mm_add_ps(velecsum,velec);
1072 fscal = _mm_and_ps(fscal,cutoff_mask);
1074 fscal = _mm_andnot_ps(dummy_mask,fscal);
1076 /* Calculate temporary vectorial force */
1077 tx = _mm_mul_ps(fscal,dx11);
1078 ty = _mm_mul_ps(fscal,dy11);
1079 tz = _mm_mul_ps(fscal,dz11);
1081 /* Update vectorial force */
1082 fix1 = _mm_add_ps(fix1,tx);
1083 fiy1 = _mm_add_ps(fiy1,ty);
1084 fiz1 = _mm_add_ps(fiz1,tz);
1086 fjx1 = _mm_add_ps(fjx1,tx);
1087 fjy1 = _mm_add_ps(fjy1,ty);
1088 fjz1 = _mm_add_ps(fjz1,tz);
1092 /**************************
1093 * CALCULATE INTERACTIONS *
1094 **************************/
1096 if (gmx_mm_any_lt(rsq12,rcutoff2))
1099 r12 = _mm_mul_ps(rsq12,rinv12);
1100 r12 = _mm_andnot_ps(dummy_mask,r12);
1102 /* EWALD ELECTROSTATICS */
1104 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1105 ewrt = _mm_mul_ps(r12,ewtabscale);
1106 ewitab = _mm_cvttps_epi32(ewrt);
1107 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1108 ewitab = _mm_slli_epi32(ewitab,2);
1109 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1110 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1111 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1112 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1113 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1114 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1115 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1116 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1117 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1119 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec = _mm_and_ps(velec,cutoff_mask);
1123 velec = _mm_andnot_ps(dummy_mask,velec);
1124 velecsum = _mm_add_ps(velecsum,velec);
1128 fscal = _mm_and_ps(fscal,cutoff_mask);
1130 fscal = _mm_andnot_ps(dummy_mask,fscal);
1132 /* Calculate temporary vectorial force */
1133 tx = _mm_mul_ps(fscal,dx12);
1134 ty = _mm_mul_ps(fscal,dy12);
1135 tz = _mm_mul_ps(fscal,dz12);
1137 /* Update vectorial force */
1138 fix1 = _mm_add_ps(fix1,tx);
1139 fiy1 = _mm_add_ps(fiy1,ty);
1140 fiz1 = _mm_add_ps(fiz1,tz);
1142 fjx2 = _mm_add_ps(fjx2,tx);
1143 fjy2 = _mm_add_ps(fjy2,ty);
1144 fjz2 = _mm_add_ps(fjz2,tz);
1148 /**************************
1149 * CALCULATE INTERACTIONS *
1150 **************************/
1152 if (gmx_mm_any_lt(rsq13,rcutoff2))
1155 r13 = _mm_mul_ps(rsq13,rinv13);
1156 r13 = _mm_andnot_ps(dummy_mask,r13);
1158 /* EWALD ELECTROSTATICS */
1160 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1161 ewrt = _mm_mul_ps(r13,ewtabscale);
1162 ewitab = _mm_cvttps_epi32(ewrt);
1163 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1164 ewitab = _mm_slli_epi32(ewitab,2);
1165 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1166 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1167 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1168 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1169 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1170 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1171 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1172 velec = _mm_mul_ps(qq13,_mm_sub_ps(_mm_sub_ps(rinv13,sh_ewald),velec));
1173 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1175 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec = _mm_and_ps(velec,cutoff_mask);
1179 velec = _mm_andnot_ps(dummy_mask,velec);
1180 velecsum = _mm_add_ps(velecsum,velec);
1184 fscal = _mm_and_ps(fscal,cutoff_mask);
1186 fscal = _mm_andnot_ps(dummy_mask,fscal);
1188 /* Calculate temporary vectorial force */
1189 tx = _mm_mul_ps(fscal,dx13);
1190 ty = _mm_mul_ps(fscal,dy13);
1191 tz = _mm_mul_ps(fscal,dz13);
1193 /* Update vectorial force */
1194 fix1 = _mm_add_ps(fix1,tx);
1195 fiy1 = _mm_add_ps(fiy1,ty);
1196 fiz1 = _mm_add_ps(fiz1,tz);
1198 fjx3 = _mm_add_ps(fjx3,tx);
1199 fjy3 = _mm_add_ps(fjy3,ty);
1200 fjz3 = _mm_add_ps(fjz3,tz);
1204 /**************************
1205 * CALCULATE INTERACTIONS *
1206 **************************/
1208 if (gmx_mm_any_lt(rsq21,rcutoff2))
1211 r21 = _mm_mul_ps(rsq21,rinv21);
1212 r21 = _mm_andnot_ps(dummy_mask,r21);
1214 /* EWALD ELECTROSTATICS */
1216 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1217 ewrt = _mm_mul_ps(r21,ewtabscale);
1218 ewitab = _mm_cvttps_epi32(ewrt);
1219 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1220 ewitab = _mm_slli_epi32(ewitab,2);
1221 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1222 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1223 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1224 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1225 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1226 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1227 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1228 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1229 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1231 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1233 /* Update potential sum for this i atom from the interaction with this j atom. */
1234 velec = _mm_and_ps(velec,cutoff_mask);
1235 velec = _mm_andnot_ps(dummy_mask,velec);
1236 velecsum = _mm_add_ps(velecsum,velec);
1240 fscal = _mm_and_ps(fscal,cutoff_mask);
1242 fscal = _mm_andnot_ps(dummy_mask,fscal);
1244 /* Calculate temporary vectorial force */
1245 tx = _mm_mul_ps(fscal,dx21);
1246 ty = _mm_mul_ps(fscal,dy21);
1247 tz = _mm_mul_ps(fscal,dz21);
1249 /* Update vectorial force */
1250 fix2 = _mm_add_ps(fix2,tx);
1251 fiy2 = _mm_add_ps(fiy2,ty);
1252 fiz2 = _mm_add_ps(fiz2,tz);
1254 fjx1 = _mm_add_ps(fjx1,tx);
1255 fjy1 = _mm_add_ps(fjy1,ty);
1256 fjz1 = _mm_add_ps(fjz1,tz);
1260 /**************************
1261 * CALCULATE INTERACTIONS *
1262 **************************/
1264 if (gmx_mm_any_lt(rsq22,rcutoff2))
1267 r22 = _mm_mul_ps(rsq22,rinv22);
1268 r22 = _mm_andnot_ps(dummy_mask,r22);
1270 /* EWALD ELECTROSTATICS */
1272 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1273 ewrt = _mm_mul_ps(r22,ewtabscale);
1274 ewitab = _mm_cvttps_epi32(ewrt);
1275 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1276 ewitab = _mm_slli_epi32(ewitab,2);
1277 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1278 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1279 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1280 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1281 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1282 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1283 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1284 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1285 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1287 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1289 /* Update potential sum for this i atom from the interaction with this j atom. */
1290 velec = _mm_and_ps(velec,cutoff_mask);
1291 velec = _mm_andnot_ps(dummy_mask,velec);
1292 velecsum = _mm_add_ps(velecsum,velec);
1296 fscal = _mm_and_ps(fscal,cutoff_mask);
1298 fscal = _mm_andnot_ps(dummy_mask,fscal);
1300 /* Calculate temporary vectorial force */
1301 tx = _mm_mul_ps(fscal,dx22);
1302 ty = _mm_mul_ps(fscal,dy22);
1303 tz = _mm_mul_ps(fscal,dz22);
1305 /* Update vectorial force */
1306 fix2 = _mm_add_ps(fix2,tx);
1307 fiy2 = _mm_add_ps(fiy2,ty);
1308 fiz2 = _mm_add_ps(fiz2,tz);
1310 fjx2 = _mm_add_ps(fjx2,tx);
1311 fjy2 = _mm_add_ps(fjy2,ty);
1312 fjz2 = _mm_add_ps(fjz2,tz);
1316 /**************************
1317 * CALCULATE INTERACTIONS *
1318 **************************/
1320 if (gmx_mm_any_lt(rsq23,rcutoff2))
1323 r23 = _mm_mul_ps(rsq23,rinv23);
1324 r23 = _mm_andnot_ps(dummy_mask,r23);
1326 /* EWALD ELECTROSTATICS */
1328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1329 ewrt = _mm_mul_ps(r23,ewtabscale);
1330 ewitab = _mm_cvttps_epi32(ewrt);
1331 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1332 ewitab = _mm_slli_epi32(ewitab,2);
1333 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1334 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1335 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1336 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1337 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1338 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1339 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1340 velec = _mm_mul_ps(qq23,_mm_sub_ps(_mm_sub_ps(rinv23,sh_ewald),velec));
1341 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1343 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
1345 /* Update potential sum for this i atom from the interaction with this j atom. */
1346 velec = _mm_and_ps(velec,cutoff_mask);
1347 velec = _mm_andnot_ps(dummy_mask,velec);
1348 velecsum = _mm_add_ps(velecsum,velec);
1352 fscal = _mm_and_ps(fscal,cutoff_mask);
1354 fscal = _mm_andnot_ps(dummy_mask,fscal);
1356 /* Calculate temporary vectorial force */
1357 tx = _mm_mul_ps(fscal,dx23);
1358 ty = _mm_mul_ps(fscal,dy23);
1359 tz = _mm_mul_ps(fscal,dz23);
1361 /* Update vectorial force */
1362 fix2 = _mm_add_ps(fix2,tx);
1363 fiy2 = _mm_add_ps(fiy2,ty);
1364 fiz2 = _mm_add_ps(fiz2,tz);
1366 fjx3 = _mm_add_ps(fjx3,tx);
1367 fjy3 = _mm_add_ps(fjy3,ty);
1368 fjz3 = _mm_add_ps(fjz3,tz);
1372 /**************************
1373 * CALCULATE INTERACTIONS *
1374 **************************/
1376 if (gmx_mm_any_lt(rsq31,rcutoff2))
1379 r31 = _mm_mul_ps(rsq31,rinv31);
1380 r31 = _mm_andnot_ps(dummy_mask,r31);
1382 /* EWALD ELECTROSTATICS */
1384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1385 ewrt = _mm_mul_ps(r31,ewtabscale);
1386 ewitab = _mm_cvttps_epi32(ewrt);
1387 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1388 ewitab = _mm_slli_epi32(ewitab,2);
1389 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1390 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1391 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1392 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1393 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1394 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1395 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1396 velec = _mm_mul_ps(qq31,_mm_sub_ps(_mm_sub_ps(rinv31,sh_ewald),velec));
1397 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1399 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
1401 /* Update potential sum for this i atom from the interaction with this j atom. */
1402 velec = _mm_and_ps(velec,cutoff_mask);
1403 velec = _mm_andnot_ps(dummy_mask,velec);
1404 velecsum = _mm_add_ps(velecsum,velec);
1408 fscal = _mm_and_ps(fscal,cutoff_mask);
1410 fscal = _mm_andnot_ps(dummy_mask,fscal);
1412 /* Calculate temporary vectorial force */
1413 tx = _mm_mul_ps(fscal,dx31);
1414 ty = _mm_mul_ps(fscal,dy31);
1415 tz = _mm_mul_ps(fscal,dz31);
1417 /* Update vectorial force */
1418 fix3 = _mm_add_ps(fix3,tx);
1419 fiy3 = _mm_add_ps(fiy3,ty);
1420 fiz3 = _mm_add_ps(fiz3,tz);
1422 fjx1 = _mm_add_ps(fjx1,tx);
1423 fjy1 = _mm_add_ps(fjy1,ty);
1424 fjz1 = _mm_add_ps(fjz1,tz);
1428 /**************************
1429 * CALCULATE INTERACTIONS *
1430 **************************/
1432 if (gmx_mm_any_lt(rsq32,rcutoff2))
1435 r32 = _mm_mul_ps(rsq32,rinv32);
1436 r32 = _mm_andnot_ps(dummy_mask,r32);
1438 /* EWALD ELECTROSTATICS */
1440 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1441 ewrt = _mm_mul_ps(r32,ewtabscale);
1442 ewitab = _mm_cvttps_epi32(ewrt);
1443 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1444 ewitab = _mm_slli_epi32(ewitab,2);
1445 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1446 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1447 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1448 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1449 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1450 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1451 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1452 velec = _mm_mul_ps(qq32,_mm_sub_ps(_mm_sub_ps(rinv32,sh_ewald),velec));
1453 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1455 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
1457 /* Update potential sum for this i atom from the interaction with this j atom. */
1458 velec = _mm_and_ps(velec,cutoff_mask);
1459 velec = _mm_andnot_ps(dummy_mask,velec);
1460 velecsum = _mm_add_ps(velecsum,velec);
1464 fscal = _mm_and_ps(fscal,cutoff_mask);
1466 fscal = _mm_andnot_ps(dummy_mask,fscal);
1468 /* Calculate temporary vectorial force */
1469 tx = _mm_mul_ps(fscal,dx32);
1470 ty = _mm_mul_ps(fscal,dy32);
1471 tz = _mm_mul_ps(fscal,dz32);
1473 /* Update vectorial force */
1474 fix3 = _mm_add_ps(fix3,tx);
1475 fiy3 = _mm_add_ps(fiy3,ty);
1476 fiz3 = _mm_add_ps(fiz3,tz);
1478 fjx2 = _mm_add_ps(fjx2,tx);
1479 fjy2 = _mm_add_ps(fjy2,ty);
1480 fjz2 = _mm_add_ps(fjz2,tz);
1484 /**************************
1485 * CALCULATE INTERACTIONS *
1486 **************************/
1488 if (gmx_mm_any_lt(rsq33,rcutoff2))
1491 r33 = _mm_mul_ps(rsq33,rinv33);
1492 r33 = _mm_andnot_ps(dummy_mask,r33);
1494 /* EWALD ELECTROSTATICS */
1496 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1497 ewrt = _mm_mul_ps(r33,ewtabscale);
1498 ewitab = _mm_cvttps_epi32(ewrt);
1499 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1500 ewitab = _mm_slli_epi32(ewitab,2);
1501 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1502 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1503 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1504 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1505 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1506 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1507 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1508 velec = _mm_mul_ps(qq33,_mm_sub_ps(_mm_sub_ps(rinv33,sh_ewald),velec));
1509 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1511 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
1513 /* Update potential sum for this i atom from the interaction with this j atom. */
1514 velec = _mm_and_ps(velec,cutoff_mask);
1515 velec = _mm_andnot_ps(dummy_mask,velec);
1516 velecsum = _mm_add_ps(velecsum,velec);
1520 fscal = _mm_and_ps(fscal,cutoff_mask);
1522 fscal = _mm_andnot_ps(dummy_mask,fscal);
1524 /* Calculate temporary vectorial force */
1525 tx = _mm_mul_ps(fscal,dx33);
1526 ty = _mm_mul_ps(fscal,dy33);
1527 tz = _mm_mul_ps(fscal,dz33);
1529 /* Update vectorial force */
1530 fix3 = _mm_add_ps(fix3,tx);
1531 fiy3 = _mm_add_ps(fiy3,ty);
1532 fiz3 = _mm_add_ps(fiz3,tz);
1534 fjx3 = _mm_add_ps(fjx3,tx);
1535 fjy3 = _mm_add_ps(fjy3,ty);
1536 fjz3 = _mm_add_ps(fjz3,tz);
1540 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1541 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1542 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1543 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1545 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1546 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1547 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1549 /* Inner loop uses 489 flops */
1552 /* End of innermost loop */
1554 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1555 f+i_coord_offset,fshift+i_shift_offset);
1558 /* Update potential energies */
1559 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1560 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1562 /* Increment number of inner iterations */
1563 inneriter += j_index_end - j_index_start;
1565 /* Outer loop uses 26 flops */
1568 /* Increment number of outer iterations */
1571 /* Update outer/inner flops */
1573 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*489);
1576 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_single
1577 * Electrostatics interaction: Ewald
1578 * VdW interaction: LJEwald
1579 * Geometry: Water4-Water4
1580 * Calculate force/pot: Force
1583 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW4W4_F_sse4_1_single
1584 (t_nblist * gmx_restrict nlist,
1585 rvec * gmx_restrict xx,
1586 rvec * gmx_restrict ff,
1587 t_forcerec * gmx_restrict fr,
1588 t_mdatoms * gmx_restrict mdatoms,
1589 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1590 t_nrnb * gmx_restrict nrnb)
1592 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1593 * just 0 for non-waters.
1594 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1595 * jnr indices corresponding to data put in the four positions in the SIMD register.
1597 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1598 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1599 int jnrA,jnrB,jnrC,jnrD;
1600 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1601 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1602 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1603 real rcutoff_scalar;
1604 real *shiftvec,*fshift,*x,*f;
1605 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1606 real scratch[4*DIM];
1607 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1609 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1611 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1613 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1615 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1616 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1617 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1618 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1619 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1620 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1621 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1622 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1623 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1624 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1625 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1626 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1627 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1628 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1629 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1630 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1631 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1632 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1633 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1634 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1637 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1640 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1641 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1652 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1654 __m128 one_half = _mm_set1_ps(0.5);
1655 __m128 minus_one = _mm_set1_ps(-1.0);
1657 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1659 __m128 dummy_mask,cutoff_mask;
1660 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1661 __m128 one = _mm_set1_ps(1.0);
1662 __m128 two = _mm_set1_ps(2.0);
1668 jindex = nlist->jindex;
1670 shiftidx = nlist->shift;
1672 shiftvec = fr->shift_vec[0];
1673 fshift = fr->fshift[0];
1674 facel = _mm_set1_ps(fr->epsfac);
1675 charge = mdatoms->chargeA;
1676 nvdwtype = fr->ntype;
1677 vdwparam = fr->nbfp;
1678 vdwtype = mdatoms->typeA;
1679 vdwgridparam = fr->ljpme_c6grid;
1680 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1681 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1682 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1684 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1685 ewtab = fr->ic->tabq_coul_F;
1686 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1687 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1689 /* Setup water-specific parameters */
1690 inr = nlist->iinr[0];
1691 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1692 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1693 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1694 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1696 jq1 = _mm_set1_ps(charge[inr+1]);
1697 jq2 = _mm_set1_ps(charge[inr+2]);
1698 jq3 = _mm_set1_ps(charge[inr+3]);
1699 vdwjidx0A = 2*vdwtype[inr+0];
1700 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1701 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1702 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1703 qq11 = _mm_mul_ps(iq1,jq1);
1704 qq12 = _mm_mul_ps(iq1,jq2);
1705 qq13 = _mm_mul_ps(iq1,jq3);
1706 qq21 = _mm_mul_ps(iq2,jq1);
1707 qq22 = _mm_mul_ps(iq2,jq2);
1708 qq23 = _mm_mul_ps(iq2,jq3);
1709 qq31 = _mm_mul_ps(iq3,jq1);
1710 qq32 = _mm_mul_ps(iq3,jq2);
1711 qq33 = _mm_mul_ps(iq3,jq3);
1713 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1714 rcutoff_scalar = fr->rcoulomb;
1715 rcutoff = _mm_set1_ps(rcutoff_scalar);
1716 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1718 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1719 rvdw = _mm_set1_ps(fr->rvdw);
1721 /* Avoid stupid compiler warnings */
1722 jnrA = jnrB = jnrC = jnrD = 0;
1723 j_coord_offsetA = 0;
1724 j_coord_offsetB = 0;
1725 j_coord_offsetC = 0;
1726 j_coord_offsetD = 0;
1731 for(iidx=0;iidx<4*DIM;iidx++)
1733 scratch[iidx] = 0.0;
1736 /* Start outer loop over neighborlists */
1737 for(iidx=0; iidx<nri; iidx++)
1739 /* Load shift vector for this list */
1740 i_shift_offset = DIM*shiftidx[iidx];
1742 /* Load limits for loop over neighbors */
1743 j_index_start = jindex[iidx];
1744 j_index_end = jindex[iidx+1];
1746 /* Get outer coordinate index */
1748 i_coord_offset = DIM*inr;
1750 /* Load i particle coords and add shift vector */
1751 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1752 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1754 fix0 = _mm_setzero_ps();
1755 fiy0 = _mm_setzero_ps();
1756 fiz0 = _mm_setzero_ps();
1757 fix1 = _mm_setzero_ps();
1758 fiy1 = _mm_setzero_ps();
1759 fiz1 = _mm_setzero_ps();
1760 fix2 = _mm_setzero_ps();
1761 fiy2 = _mm_setzero_ps();
1762 fiz2 = _mm_setzero_ps();
1763 fix3 = _mm_setzero_ps();
1764 fiy3 = _mm_setzero_ps();
1765 fiz3 = _mm_setzero_ps();
1767 /* Start inner kernel loop */
1768 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1771 /* Get j neighbor index, and coordinate index */
1773 jnrB = jjnr[jidx+1];
1774 jnrC = jjnr[jidx+2];
1775 jnrD = jjnr[jidx+3];
1776 j_coord_offsetA = DIM*jnrA;
1777 j_coord_offsetB = DIM*jnrB;
1778 j_coord_offsetC = DIM*jnrC;
1779 j_coord_offsetD = DIM*jnrD;
1781 /* load j atom coordinates */
1782 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1783 x+j_coord_offsetC,x+j_coord_offsetD,
1784 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1785 &jy2,&jz2,&jx3,&jy3,&jz3);
1787 /* Calculate displacement vector */
1788 dx00 = _mm_sub_ps(ix0,jx0);
1789 dy00 = _mm_sub_ps(iy0,jy0);
1790 dz00 = _mm_sub_ps(iz0,jz0);
1791 dx11 = _mm_sub_ps(ix1,jx1);
1792 dy11 = _mm_sub_ps(iy1,jy1);
1793 dz11 = _mm_sub_ps(iz1,jz1);
1794 dx12 = _mm_sub_ps(ix1,jx2);
1795 dy12 = _mm_sub_ps(iy1,jy2);
1796 dz12 = _mm_sub_ps(iz1,jz2);
1797 dx13 = _mm_sub_ps(ix1,jx3);
1798 dy13 = _mm_sub_ps(iy1,jy3);
1799 dz13 = _mm_sub_ps(iz1,jz3);
1800 dx21 = _mm_sub_ps(ix2,jx1);
1801 dy21 = _mm_sub_ps(iy2,jy1);
1802 dz21 = _mm_sub_ps(iz2,jz1);
1803 dx22 = _mm_sub_ps(ix2,jx2);
1804 dy22 = _mm_sub_ps(iy2,jy2);
1805 dz22 = _mm_sub_ps(iz2,jz2);
1806 dx23 = _mm_sub_ps(ix2,jx3);
1807 dy23 = _mm_sub_ps(iy2,jy3);
1808 dz23 = _mm_sub_ps(iz2,jz3);
1809 dx31 = _mm_sub_ps(ix3,jx1);
1810 dy31 = _mm_sub_ps(iy3,jy1);
1811 dz31 = _mm_sub_ps(iz3,jz1);
1812 dx32 = _mm_sub_ps(ix3,jx2);
1813 dy32 = _mm_sub_ps(iy3,jy2);
1814 dz32 = _mm_sub_ps(iz3,jz2);
1815 dx33 = _mm_sub_ps(ix3,jx3);
1816 dy33 = _mm_sub_ps(iy3,jy3);
1817 dz33 = _mm_sub_ps(iz3,jz3);
1819 /* Calculate squared distance and things based on it */
1820 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1821 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1822 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1823 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1824 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1825 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1826 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1827 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1828 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1829 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1831 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1832 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1833 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1834 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1835 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1836 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1837 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1838 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1839 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1840 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1842 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1843 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1844 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1845 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1846 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1847 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1848 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1849 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1850 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1851 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1853 fjx0 = _mm_setzero_ps();
1854 fjy0 = _mm_setzero_ps();
1855 fjz0 = _mm_setzero_ps();
1856 fjx1 = _mm_setzero_ps();
1857 fjy1 = _mm_setzero_ps();
1858 fjz1 = _mm_setzero_ps();
1859 fjx2 = _mm_setzero_ps();
1860 fjy2 = _mm_setzero_ps();
1861 fjz2 = _mm_setzero_ps();
1862 fjx3 = _mm_setzero_ps();
1863 fjy3 = _mm_setzero_ps();
1864 fjz3 = _mm_setzero_ps();
1866 /**************************
1867 * CALCULATE INTERACTIONS *
1868 **************************/
1870 if (gmx_mm_any_lt(rsq00,rcutoff2))
1873 r00 = _mm_mul_ps(rsq00,rinv00);
1875 /* Analytical LJ-PME */
1876 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1877 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1878 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1879 exponent = gmx_simd_exp_r(ewcljrsq);
1880 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1881 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1882 /* f6A = 6 * C6grid * (1 - poly) */
1883 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1884 /* f6B = C6grid * exponent * beta^6 */
1885 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1886 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1887 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1889 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1893 fscal = _mm_and_ps(fscal,cutoff_mask);
1895 /* Calculate temporary vectorial force */
1896 tx = _mm_mul_ps(fscal,dx00);
1897 ty = _mm_mul_ps(fscal,dy00);
1898 tz = _mm_mul_ps(fscal,dz00);
1900 /* Update vectorial force */
1901 fix0 = _mm_add_ps(fix0,tx);
1902 fiy0 = _mm_add_ps(fiy0,ty);
1903 fiz0 = _mm_add_ps(fiz0,tz);
1905 fjx0 = _mm_add_ps(fjx0,tx);
1906 fjy0 = _mm_add_ps(fjy0,ty);
1907 fjz0 = _mm_add_ps(fjz0,tz);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 if (gmx_mm_any_lt(rsq11,rcutoff2))
1918 r11 = _mm_mul_ps(rsq11,rinv11);
1920 /* EWALD ELECTROSTATICS */
1922 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1923 ewrt = _mm_mul_ps(r11,ewtabscale);
1924 ewitab = _mm_cvttps_epi32(ewrt);
1925 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1926 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1927 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1929 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1930 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1932 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1936 fscal = _mm_and_ps(fscal,cutoff_mask);
1938 /* Calculate temporary vectorial force */
1939 tx = _mm_mul_ps(fscal,dx11);
1940 ty = _mm_mul_ps(fscal,dy11);
1941 tz = _mm_mul_ps(fscal,dz11);
1943 /* Update vectorial force */
1944 fix1 = _mm_add_ps(fix1,tx);
1945 fiy1 = _mm_add_ps(fiy1,ty);
1946 fiz1 = _mm_add_ps(fiz1,tz);
1948 fjx1 = _mm_add_ps(fjx1,tx);
1949 fjy1 = _mm_add_ps(fjy1,ty);
1950 fjz1 = _mm_add_ps(fjz1,tz);
1954 /**************************
1955 * CALCULATE INTERACTIONS *
1956 **************************/
1958 if (gmx_mm_any_lt(rsq12,rcutoff2))
1961 r12 = _mm_mul_ps(rsq12,rinv12);
1963 /* EWALD ELECTROSTATICS */
1965 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1966 ewrt = _mm_mul_ps(r12,ewtabscale);
1967 ewitab = _mm_cvttps_epi32(ewrt);
1968 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1969 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
1970 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
1972 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1973 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1975 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1979 fscal = _mm_and_ps(fscal,cutoff_mask);
1981 /* Calculate temporary vectorial force */
1982 tx = _mm_mul_ps(fscal,dx12);
1983 ty = _mm_mul_ps(fscal,dy12);
1984 tz = _mm_mul_ps(fscal,dz12);
1986 /* Update vectorial force */
1987 fix1 = _mm_add_ps(fix1,tx);
1988 fiy1 = _mm_add_ps(fiy1,ty);
1989 fiz1 = _mm_add_ps(fiz1,tz);
1991 fjx2 = _mm_add_ps(fjx2,tx);
1992 fjy2 = _mm_add_ps(fjy2,ty);
1993 fjz2 = _mm_add_ps(fjz2,tz);
1997 /**************************
1998 * CALCULATE INTERACTIONS *
1999 **************************/
2001 if (gmx_mm_any_lt(rsq13,rcutoff2))
2004 r13 = _mm_mul_ps(rsq13,rinv13);
2006 /* EWALD ELECTROSTATICS */
2008 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2009 ewrt = _mm_mul_ps(r13,ewtabscale);
2010 ewitab = _mm_cvttps_epi32(ewrt);
2011 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2012 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2013 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2015 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2016 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2018 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2022 fscal = _mm_and_ps(fscal,cutoff_mask);
2024 /* Calculate temporary vectorial force */
2025 tx = _mm_mul_ps(fscal,dx13);
2026 ty = _mm_mul_ps(fscal,dy13);
2027 tz = _mm_mul_ps(fscal,dz13);
2029 /* Update vectorial force */
2030 fix1 = _mm_add_ps(fix1,tx);
2031 fiy1 = _mm_add_ps(fiy1,ty);
2032 fiz1 = _mm_add_ps(fiz1,tz);
2034 fjx3 = _mm_add_ps(fjx3,tx);
2035 fjy3 = _mm_add_ps(fjy3,ty);
2036 fjz3 = _mm_add_ps(fjz3,tz);
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 if (gmx_mm_any_lt(rsq21,rcutoff2))
2047 r21 = _mm_mul_ps(rsq21,rinv21);
2049 /* EWALD ELECTROSTATICS */
2051 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2052 ewrt = _mm_mul_ps(r21,ewtabscale);
2053 ewitab = _mm_cvttps_epi32(ewrt);
2054 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2055 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2056 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2058 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2059 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2061 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2065 fscal = _mm_and_ps(fscal,cutoff_mask);
2067 /* Calculate temporary vectorial force */
2068 tx = _mm_mul_ps(fscal,dx21);
2069 ty = _mm_mul_ps(fscal,dy21);
2070 tz = _mm_mul_ps(fscal,dz21);
2072 /* Update vectorial force */
2073 fix2 = _mm_add_ps(fix2,tx);
2074 fiy2 = _mm_add_ps(fiy2,ty);
2075 fiz2 = _mm_add_ps(fiz2,tz);
2077 fjx1 = _mm_add_ps(fjx1,tx);
2078 fjy1 = _mm_add_ps(fjy1,ty);
2079 fjz1 = _mm_add_ps(fjz1,tz);
2083 /**************************
2084 * CALCULATE INTERACTIONS *
2085 **************************/
2087 if (gmx_mm_any_lt(rsq22,rcutoff2))
2090 r22 = _mm_mul_ps(rsq22,rinv22);
2092 /* EWALD ELECTROSTATICS */
2094 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2095 ewrt = _mm_mul_ps(r22,ewtabscale);
2096 ewitab = _mm_cvttps_epi32(ewrt);
2097 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2098 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2099 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2101 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2102 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2104 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2108 fscal = _mm_and_ps(fscal,cutoff_mask);
2110 /* Calculate temporary vectorial force */
2111 tx = _mm_mul_ps(fscal,dx22);
2112 ty = _mm_mul_ps(fscal,dy22);
2113 tz = _mm_mul_ps(fscal,dz22);
2115 /* Update vectorial force */
2116 fix2 = _mm_add_ps(fix2,tx);
2117 fiy2 = _mm_add_ps(fiy2,ty);
2118 fiz2 = _mm_add_ps(fiz2,tz);
2120 fjx2 = _mm_add_ps(fjx2,tx);
2121 fjy2 = _mm_add_ps(fjy2,ty);
2122 fjz2 = _mm_add_ps(fjz2,tz);
2126 /**************************
2127 * CALCULATE INTERACTIONS *
2128 **************************/
2130 if (gmx_mm_any_lt(rsq23,rcutoff2))
2133 r23 = _mm_mul_ps(rsq23,rinv23);
2135 /* EWALD ELECTROSTATICS */
2137 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2138 ewrt = _mm_mul_ps(r23,ewtabscale);
2139 ewitab = _mm_cvttps_epi32(ewrt);
2140 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2141 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2142 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2144 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2145 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2147 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2151 fscal = _mm_and_ps(fscal,cutoff_mask);
2153 /* Calculate temporary vectorial force */
2154 tx = _mm_mul_ps(fscal,dx23);
2155 ty = _mm_mul_ps(fscal,dy23);
2156 tz = _mm_mul_ps(fscal,dz23);
2158 /* Update vectorial force */
2159 fix2 = _mm_add_ps(fix2,tx);
2160 fiy2 = _mm_add_ps(fiy2,ty);
2161 fiz2 = _mm_add_ps(fiz2,tz);
2163 fjx3 = _mm_add_ps(fjx3,tx);
2164 fjy3 = _mm_add_ps(fjy3,ty);
2165 fjz3 = _mm_add_ps(fjz3,tz);
2169 /**************************
2170 * CALCULATE INTERACTIONS *
2171 **************************/
2173 if (gmx_mm_any_lt(rsq31,rcutoff2))
2176 r31 = _mm_mul_ps(rsq31,rinv31);
2178 /* EWALD ELECTROSTATICS */
2180 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2181 ewrt = _mm_mul_ps(r31,ewtabscale);
2182 ewitab = _mm_cvttps_epi32(ewrt);
2183 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2184 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2185 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2187 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2188 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2190 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2194 fscal = _mm_and_ps(fscal,cutoff_mask);
2196 /* Calculate temporary vectorial force */
2197 tx = _mm_mul_ps(fscal,dx31);
2198 ty = _mm_mul_ps(fscal,dy31);
2199 tz = _mm_mul_ps(fscal,dz31);
2201 /* Update vectorial force */
2202 fix3 = _mm_add_ps(fix3,tx);
2203 fiy3 = _mm_add_ps(fiy3,ty);
2204 fiz3 = _mm_add_ps(fiz3,tz);
2206 fjx1 = _mm_add_ps(fjx1,tx);
2207 fjy1 = _mm_add_ps(fjy1,ty);
2208 fjz1 = _mm_add_ps(fjz1,tz);
2212 /**************************
2213 * CALCULATE INTERACTIONS *
2214 **************************/
2216 if (gmx_mm_any_lt(rsq32,rcutoff2))
2219 r32 = _mm_mul_ps(rsq32,rinv32);
2221 /* EWALD ELECTROSTATICS */
2223 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2224 ewrt = _mm_mul_ps(r32,ewtabscale);
2225 ewitab = _mm_cvttps_epi32(ewrt);
2226 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2227 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2228 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2230 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2231 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2233 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2237 fscal = _mm_and_ps(fscal,cutoff_mask);
2239 /* Calculate temporary vectorial force */
2240 tx = _mm_mul_ps(fscal,dx32);
2241 ty = _mm_mul_ps(fscal,dy32);
2242 tz = _mm_mul_ps(fscal,dz32);
2244 /* Update vectorial force */
2245 fix3 = _mm_add_ps(fix3,tx);
2246 fiy3 = _mm_add_ps(fiy3,ty);
2247 fiz3 = _mm_add_ps(fiz3,tz);
2249 fjx2 = _mm_add_ps(fjx2,tx);
2250 fjy2 = _mm_add_ps(fjy2,ty);
2251 fjz2 = _mm_add_ps(fjz2,tz);
2255 /**************************
2256 * CALCULATE INTERACTIONS *
2257 **************************/
2259 if (gmx_mm_any_lt(rsq33,rcutoff2))
2262 r33 = _mm_mul_ps(rsq33,rinv33);
2264 /* EWALD ELECTROSTATICS */
2266 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2267 ewrt = _mm_mul_ps(r33,ewtabscale);
2268 ewitab = _mm_cvttps_epi32(ewrt);
2269 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2270 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2271 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2273 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2274 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2276 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2280 fscal = _mm_and_ps(fscal,cutoff_mask);
2282 /* Calculate temporary vectorial force */
2283 tx = _mm_mul_ps(fscal,dx33);
2284 ty = _mm_mul_ps(fscal,dy33);
2285 tz = _mm_mul_ps(fscal,dz33);
2287 /* Update vectorial force */
2288 fix3 = _mm_add_ps(fix3,tx);
2289 fiy3 = _mm_add_ps(fiy3,ty);
2290 fiz3 = _mm_add_ps(fiz3,tz);
2292 fjx3 = _mm_add_ps(fjx3,tx);
2293 fjy3 = _mm_add_ps(fjy3,ty);
2294 fjz3 = _mm_add_ps(fjz3,tz);
2298 fjptrA = f+j_coord_offsetA;
2299 fjptrB = f+j_coord_offsetB;
2300 fjptrC = f+j_coord_offsetC;
2301 fjptrD = f+j_coord_offsetD;
2303 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2304 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2305 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2307 /* Inner loop uses 403 flops */
2310 if(jidx<j_index_end)
2313 /* Get j neighbor index, and coordinate index */
2314 jnrlistA = jjnr[jidx];
2315 jnrlistB = jjnr[jidx+1];
2316 jnrlistC = jjnr[jidx+2];
2317 jnrlistD = jjnr[jidx+3];
2318 /* Sign of each element will be negative for non-real atoms.
2319 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2320 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2322 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2323 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2324 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2325 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2326 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2327 j_coord_offsetA = DIM*jnrA;
2328 j_coord_offsetB = DIM*jnrB;
2329 j_coord_offsetC = DIM*jnrC;
2330 j_coord_offsetD = DIM*jnrD;
2332 /* load j atom coordinates */
2333 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2334 x+j_coord_offsetC,x+j_coord_offsetD,
2335 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2336 &jy2,&jz2,&jx3,&jy3,&jz3);
2338 /* Calculate displacement vector */
2339 dx00 = _mm_sub_ps(ix0,jx0);
2340 dy00 = _mm_sub_ps(iy0,jy0);
2341 dz00 = _mm_sub_ps(iz0,jz0);
2342 dx11 = _mm_sub_ps(ix1,jx1);
2343 dy11 = _mm_sub_ps(iy1,jy1);
2344 dz11 = _mm_sub_ps(iz1,jz1);
2345 dx12 = _mm_sub_ps(ix1,jx2);
2346 dy12 = _mm_sub_ps(iy1,jy2);
2347 dz12 = _mm_sub_ps(iz1,jz2);
2348 dx13 = _mm_sub_ps(ix1,jx3);
2349 dy13 = _mm_sub_ps(iy1,jy3);
2350 dz13 = _mm_sub_ps(iz1,jz3);
2351 dx21 = _mm_sub_ps(ix2,jx1);
2352 dy21 = _mm_sub_ps(iy2,jy1);
2353 dz21 = _mm_sub_ps(iz2,jz1);
2354 dx22 = _mm_sub_ps(ix2,jx2);
2355 dy22 = _mm_sub_ps(iy2,jy2);
2356 dz22 = _mm_sub_ps(iz2,jz2);
2357 dx23 = _mm_sub_ps(ix2,jx3);
2358 dy23 = _mm_sub_ps(iy2,jy3);
2359 dz23 = _mm_sub_ps(iz2,jz3);
2360 dx31 = _mm_sub_ps(ix3,jx1);
2361 dy31 = _mm_sub_ps(iy3,jy1);
2362 dz31 = _mm_sub_ps(iz3,jz1);
2363 dx32 = _mm_sub_ps(ix3,jx2);
2364 dy32 = _mm_sub_ps(iy3,jy2);
2365 dz32 = _mm_sub_ps(iz3,jz2);
2366 dx33 = _mm_sub_ps(ix3,jx3);
2367 dy33 = _mm_sub_ps(iy3,jy3);
2368 dz33 = _mm_sub_ps(iz3,jz3);
2370 /* Calculate squared distance and things based on it */
2371 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2372 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2373 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2374 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2375 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2376 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2377 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2378 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2379 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2380 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2382 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2383 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2384 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2385 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2386 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2387 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2388 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2389 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2390 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2391 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2393 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2394 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2395 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2396 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2397 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2398 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2399 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2400 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2401 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2402 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2404 fjx0 = _mm_setzero_ps();
2405 fjy0 = _mm_setzero_ps();
2406 fjz0 = _mm_setzero_ps();
2407 fjx1 = _mm_setzero_ps();
2408 fjy1 = _mm_setzero_ps();
2409 fjz1 = _mm_setzero_ps();
2410 fjx2 = _mm_setzero_ps();
2411 fjy2 = _mm_setzero_ps();
2412 fjz2 = _mm_setzero_ps();
2413 fjx3 = _mm_setzero_ps();
2414 fjy3 = _mm_setzero_ps();
2415 fjz3 = _mm_setzero_ps();
2417 /**************************
2418 * CALCULATE INTERACTIONS *
2419 **************************/
2421 if (gmx_mm_any_lt(rsq00,rcutoff2))
2424 r00 = _mm_mul_ps(rsq00,rinv00);
2425 r00 = _mm_andnot_ps(dummy_mask,r00);
2427 /* Analytical LJ-PME */
2428 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2429 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
2430 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2431 exponent = gmx_simd_exp_r(ewcljrsq);
2432 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2433 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2434 /* f6A = 6 * C6grid * (1 - poly) */
2435 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2436 /* f6B = C6grid * exponent * beta^6 */
2437 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2438 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2439 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2441 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2445 fscal = _mm_and_ps(fscal,cutoff_mask);
2447 fscal = _mm_andnot_ps(dummy_mask,fscal);
2449 /* Calculate temporary vectorial force */
2450 tx = _mm_mul_ps(fscal,dx00);
2451 ty = _mm_mul_ps(fscal,dy00);
2452 tz = _mm_mul_ps(fscal,dz00);
2454 /* Update vectorial force */
2455 fix0 = _mm_add_ps(fix0,tx);
2456 fiy0 = _mm_add_ps(fiy0,ty);
2457 fiz0 = _mm_add_ps(fiz0,tz);
2459 fjx0 = _mm_add_ps(fjx0,tx);
2460 fjy0 = _mm_add_ps(fjy0,ty);
2461 fjz0 = _mm_add_ps(fjz0,tz);
2465 /**************************
2466 * CALCULATE INTERACTIONS *
2467 **************************/
2469 if (gmx_mm_any_lt(rsq11,rcutoff2))
2472 r11 = _mm_mul_ps(rsq11,rinv11);
2473 r11 = _mm_andnot_ps(dummy_mask,r11);
2475 /* EWALD ELECTROSTATICS */
2477 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2478 ewrt = _mm_mul_ps(r11,ewtabscale);
2479 ewitab = _mm_cvttps_epi32(ewrt);
2480 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2481 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2482 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2484 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2485 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2487 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2491 fscal = _mm_and_ps(fscal,cutoff_mask);
2493 fscal = _mm_andnot_ps(dummy_mask,fscal);
2495 /* Calculate temporary vectorial force */
2496 tx = _mm_mul_ps(fscal,dx11);
2497 ty = _mm_mul_ps(fscal,dy11);
2498 tz = _mm_mul_ps(fscal,dz11);
2500 /* Update vectorial force */
2501 fix1 = _mm_add_ps(fix1,tx);
2502 fiy1 = _mm_add_ps(fiy1,ty);
2503 fiz1 = _mm_add_ps(fiz1,tz);
2505 fjx1 = _mm_add_ps(fjx1,tx);
2506 fjy1 = _mm_add_ps(fjy1,ty);
2507 fjz1 = _mm_add_ps(fjz1,tz);
2511 /**************************
2512 * CALCULATE INTERACTIONS *
2513 **************************/
2515 if (gmx_mm_any_lt(rsq12,rcutoff2))
2518 r12 = _mm_mul_ps(rsq12,rinv12);
2519 r12 = _mm_andnot_ps(dummy_mask,r12);
2521 /* EWALD ELECTROSTATICS */
2523 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2524 ewrt = _mm_mul_ps(r12,ewtabscale);
2525 ewitab = _mm_cvttps_epi32(ewrt);
2526 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2527 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2528 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2530 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2531 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2533 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2537 fscal = _mm_and_ps(fscal,cutoff_mask);
2539 fscal = _mm_andnot_ps(dummy_mask,fscal);
2541 /* Calculate temporary vectorial force */
2542 tx = _mm_mul_ps(fscal,dx12);
2543 ty = _mm_mul_ps(fscal,dy12);
2544 tz = _mm_mul_ps(fscal,dz12);
2546 /* Update vectorial force */
2547 fix1 = _mm_add_ps(fix1,tx);
2548 fiy1 = _mm_add_ps(fiy1,ty);
2549 fiz1 = _mm_add_ps(fiz1,tz);
2551 fjx2 = _mm_add_ps(fjx2,tx);
2552 fjy2 = _mm_add_ps(fjy2,ty);
2553 fjz2 = _mm_add_ps(fjz2,tz);
2557 /**************************
2558 * CALCULATE INTERACTIONS *
2559 **************************/
2561 if (gmx_mm_any_lt(rsq13,rcutoff2))
2564 r13 = _mm_mul_ps(rsq13,rinv13);
2565 r13 = _mm_andnot_ps(dummy_mask,r13);
2567 /* EWALD ELECTROSTATICS */
2569 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2570 ewrt = _mm_mul_ps(r13,ewtabscale);
2571 ewitab = _mm_cvttps_epi32(ewrt);
2572 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2573 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2574 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2576 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2577 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2579 cutoff_mask = _mm_cmplt_ps(rsq13,rcutoff2);
2583 fscal = _mm_and_ps(fscal,cutoff_mask);
2585 fscal = _mm_andnot_ps(dummy_mask,fscal);
2587 /* Calculate temporary vectorial force */
2588 tx = _mm_mul_ps(fscal,dx13);
2589 ty = _mm_mul_ps(fscal,dy13);
2590 tz = _mm_mul_ps(fscal,dz13);
2592 /* Update vectorial force */
2593 fix1 = _mm_add_ps(fix1,tx);
2594 fiy1 = _mm_add_ps(fiy1,ty);
2595 fiz1 = _mm_add_ps(fiz1,tz);
2597 fjx3 = _mm_add_ps(fjx3,tx);
2598 fjy3 = _mm_add_ps(fjy3,ty);
2599 fjz3 = _mm_add_ps(fjz3,tz);
2603 /**************************
2604 * CALCULATE INTERACTIONS *
2605 **************************/
2607 if (gmx_mm_any_lt(rsq21,rcutoff2))
2610 r21 = _mm_mul_ps(rsq21,rinv21);
2611 r21 = _mm_andnot_ps(dummy_mask,r21);
2613 /* EWALD ELECTROSTATICS */
2615 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2616 ewrt = _mm_mul_ps(r21,ewtabscale);
2617 ewitab = _mm_cvttps_epi32(ewrt);
2618 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2619 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2620 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2622 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2623 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2625 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2629 fscal = _mm_and_ps(fscal,cutoff_mask);
2631 fscal = _mm_andnot_ps(dummy_mask,fscal);
2633 /* Calculate temporary vectorial force */
2634 tx = _mm_mul_ps(fscal,dx21);
2635 ty = _mm_mul_ps(fscal,dy21);
2636 tz = _mm_mul_ps(fscal,dz21);
2638 /* Update vectorial force */
2639 fix2 = _mm_add_ps(fix2,tx);
2640 fiy2 = _mm_add_ps(fiy2,ty);
2641 fiz2 = _mm_add_ps(fiz2,tz);
2643 fjx1 = _mm_add_ps(fjx1,tx);
2644 fjy1 = _mm_add_ps(fjy1,ty);
2645 fjz1 = _mm_add_ps(fjz1,tz);
2649 /**************************
2650 * CALCULATE INTERACTIONS *
2651 **************************/
2653 if (gmx_mm_any_lt(rsq22,rcutoff2))
2656 r22 = _mm_mul_ps(rsq22,rinv22);
2657 r22 = _mm_andnot_ps(dummy_mask,r22);
2659 /* EWALD ELECTROSTATICS */
2661 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2662 ewrt = _mm_mul_ps(r22,ewtabscale);
2663 ewitab = _mm_cvttps_epi32(ewrt);
2664 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2665 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2666 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2668 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2669 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2671 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2675 fscal = _mm_and_ps(fscal,cutoff_mask);
2677 fscal = _mm_andnot_ps(dummy_mask,fscal);
2679 /* Calculate temporary vectorial force */
2680 tx = _mm_mul_ps(fscal,dx22);
2681 ty = _mm_mul_ps(fscal,dy22);
2682 tz = _mm_mul_ps(fscal,dz22);
2684 /* Update vectorial force */
2685 fix2 = _mm_add_ps(fix2,tx);
2686 fiy2 = _mm_add_ps(fiy2,ty);
2687 fiz2 = _mm_add_ps(fiz2,tz);
2689 fjx2 = _mm_add_ps(fjx2,tx);
2690 fjy2 = _mm_add_ps(fjy2,ty);
2691 fjz2 = _mm_add_ps(fjz2,tz);
2695 /**************************
2696 * CALCULATE INTERACTIONS *
2697 **************************/
2699 if (gmx_mm_any_lt(rsq23,rcutoff2))
2702 r23 = _mm_mul_ps(rsq23,rinv23);
2703 r23 = _mm_andnot_ps(dummy_mask,r23);
2705 /* EWALD ELECTROSTATICS */
2707 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2708 ewrt = _mm_mul_ps(r23,ewtabscale);
2709 ewitab = _mm_cvttps_epi32(ewrt);
2710 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2711 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2712 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2714 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2715 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2717 cutoff_mask = _mm_cmplt_ps(rsq23,rcutoff2);
2721 fscal = _mm_and_ps(fscal,cutoff_mask);
2723 fscal = _mm_andnot_ps(dummy_mask,fscal);
2725 /* Calculate temporary vectorial force */
2726 tx = _mm_mul_ps(fscal,dx23);
2727 ty = _mm_mul_ps(fscal,dy23);
2728 tz = _mm_mul_ps(fscal,dz23);
2730 /* Update vectorial force */
2731 fix2 = _mm_add_ps(fix2,tx);
2732 fiy2 = _mm_add_ps(fiy2,ty);
2733 fiz2 = _mm_add_ps(fiz2,tz);
2735 fjx3 = _mm_add_ps(fjx3,tx);
2736 fjy3 = _mm_add_ps(fjy3,ty);
2737 fjz3 = _mm_add_ps(fjz3,tz);
2741 /**************************
2742 * CALCULATE INTERACTIONS *
2743 **************************/
2745 if (gmx_mm_any_lt(rsq31,rcutoff2))
2748 r31 = _mm_mul_ps(rsq31,rinv31);
2749 r31 = _mm_andnot_ps(dummy_mask,r31);
2751 /* EWALD ELECTROSTATICS */
2753 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2754 ewrt = _mm_mul_ps(r31,ewtabscale);
2755 ewitab = _mm_cvttps_epi32(ewrt);
2756 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2757 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2758 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2760 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2761 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2763 cutoff_mask = _mm_cmplt_ps(rsq31,rcutoff2);
2767 fscal = _mm_and_ps(fscal,cutoff_mask);
2769 fscal = _mm_andnot_ps(dummy_mask,fscal);
2771 /* Calculate temporary vectorial force */
2772 tx = _mm_mul_ps(fscal,dx31);
2773 ty = _mm_mul_ps(fscal,dy31);
2774 tz = _mm_mul_ps(fscal,dz31);
2776 /* Update vectorial force */
2777 fix3 = _mm_add_ps(fix3,tx);
2778 fiy3 = _mm_add_ps(fiy3,ty);
2779 fiz3 = _mm_add_ps(fiz3,tz);
2781 fjx1 = _mm_add_ps(fjx1,tx);
2782 fjy1 = _mm_add_ps(fjy1,ty);
2783 fjz1 = _mm_add_ps(fjz1,tz);
2787 /**************************
2788 * CALCULATE INTERACTIONS *
2789 **************************/
2791 if (gmx_mm_any_lt(rsq32,rcutoff2))
2794 r32 = _mm_mul_ps(rsq32,rinv32);
2795 r32 = _mm_andnot_ps(dummy_mask,r32);
2797 /* EWALD ELECTROSTATICS */
2799 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2800 ewrt = _mm_mul_ps(r32,ewtabscale);
2801 ewitab = _mm_cvttps_epi32(ewrt);
2802 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2803 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2804 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2806 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2807 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2809 cutoff_mask = _mm_cmplt_ps(rsq32,rcutoff2);
2813 fscal = _mm_and_ps(fscal,cutoff_mask);
2815 fscal = _mm_andnot_ps(dummy_mask,fscal);
2817 /* Calculate temporary vectorial force */
2818 tx = _mm_mul_ps(fscal,dx32);
2819 ty = _mm_mul_ps(fscal,dy32);
2820 tz = _mm_mul_ps(fscal,dz32);
2822 /* Update vectorial force */
2823 fix3 = _mm_add_ps(fix3,tx);
2824 fiy3 = _mm_add_ps(fiy3,ty);
2825 fiz3 = _mm_add_ps(fiz3,tz);
2827 fjx2 = _mm_add_ps(fjx2,tx);
2828 fjy2 = _mm_add_ps(fjy2,ty);
2829 fjz2 = _mm_add_ps(fjz2,tz);
2833 /**************************
2834 * CALCULATE INTERACTIONS *
2835 **************************/
2837 if (gmx_mm_any_lt(rsq33,rcutoff2))
2840 r33 = _mm_mul_ps(rsq33,rinv33);
2841 r33 = _mm_andnot_ps(dummy_mask,r33);
2843 /* EWALD ELECTROSTATICS */
2845 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2846 ewrt = _mm_mul_ps(r33,ewtabscale);
2847 ewitab = _mm_cvttps_epi32(ewrt);
2848 eweps = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2849 gmx_mm_load_4pair_swizzle_ps(ewtab + gmx_mm_extract_epi32(ewitab,0),ewtab + gmx_mm_extract_epi32(ewitab,1),
2850 ewtab + gmx_mm_extract_epi32(ewitab,2),ewtab + gmx_mm_extract_epi32(ewitab,3),
2852 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2853 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2855 cutoff_mask = _mm_cmplt_ps(rsq33,rcutoff2);
2859 fscal = _mm_and_ps(fscal,cutoff_mask);
2861 fscal = _mm_andnot_ps(dummy_mask,fscal);
2863 /* Calculate temporary vectorial force */
2864 tx = _mm_mul_ps(fscal,dx33);
2865 ty = _mm_mul_ps(fscal,dy33);
2866 tz = _mm_mul_ps(fscal,dz33);
2868 /* Update vectorial force */
2869 fix3 = _mm_add_ps(fix3,tx);
2870 fiy3 = _mm_add_ps(fiy3,ty);
2871 fiz3 = _mm_add_ps(fiz3,tz);
2873 fjx3 = _mm_add_ps(fjx3,tx);
2874 fjy3 = _mm_add_ps(fjy3,ty);
2875 fjz3 = _mm_add_ps(fjz3,tz);
2879 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2880 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2881 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2882 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2884 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2885 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2886 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2888 /* Inner loop uses 413 flops */
2891 /* End of innermost loop */
2893 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2894 f+i_coord_offset,fshift+i_shift_offset);
2896 /* Increment number of inner iterations */
2897 inneriter += j_index_end - j_index_start;
2899 /* Outer loop uses 24 flops */
2902 /* Increment number of outer iterations */
2905 /* Update outer/inner flops */
2907 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*413);