2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS sse2_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_single
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LJEwald
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_VF_sse2_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
107 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
111 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
121 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
123 __m128 one_half = _mm_set1_ps(0.5);
124 __m128 minus_one = _mm_set1_ps(-1.0);
126 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
128 __m128 dummy_mask,cutoff_mask;
129 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
130 __m128 one = _mm_set1_ps(1.0);
131 __m128 two = _mm_set1_ps(2.0);
137 jindex = nlist->jindex;
139 shiftidx = nlist->shift;
141 shiftvec = fr->shift_vec[0];
142 fshift = fr->fshift[0];
143 facel = _mm_set1_ps(fr->epsfac);
144 charge = mdatoms->chargeA;
145 nvdwtype = fr->ntype;
147 vdwtype = mdatoms->typeA;
148 vdwgridparam = fr->ljpme_c6grid;
149 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
150 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
151 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
153 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
154 ewtab = fr->ic->tabq_coul_FDV0;
155 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
156 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
158 /* Setup water-specific parameters */
159 inr = nlist->iinr[0];
160 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
161 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
162 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
163 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
165 jq0 = _mm_set1_ps(charge[inr+0]);
166 jq1 = _mm_set1_ps(charge[inr+1]);
167 jq2 = _mm_set1_ps(charge[inr+2]);
168 vdwjidx0A = 2*vdwtype[inr+0];
169 qq00 = _mm_mul_ps(iq0,jq0);
170 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
171 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
172 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
173 qq01 = _mm_mul_ps(iq0,jq1);
174 qq02 = _mm_mul_ps(iq0,jq2);
175 qq10 = _mm_mul_ps(iq1,jq0);
176 qq11 = _mm_mul_ps(iq1,jq1);
177 qq12 = _mm_mul_ps(iq1,jq2);
178 qq20 = _mm_mul_ps(iq2,jq0);
179 qq21 = _mm_mul_ps(iq2,jq1);
180 qq22 = _mm_mul_ps(iq2,jq2);
182 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
183 rcutoff_scalar = fr->rcoulomb;
184 rcutoff = _mm_set1_ps(rcutoff_scalar);
185 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
187 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
188 rvdw = _mm_set1_ps(fr->rvdw);
190 /* Avoid stupid compiler warnings */
191 jnrA = jnrB = jnrC = jnrD = 0;
200 for(iidx=0;iidx<4*DIM;iidx++)
205 /* Start outer loop over neighborlists */
206 for(iidx=0; iidx<nri; iidx++)
208 /* Load shift vector for this list */
209 i_shift_offset = DIM*shiftidx[iidx];
211 /* Load limits for loop over neighbors */
212 j_index_start = jindex[iidx];
213 j_index_end = jindex[iidx+1];
215 /* Get outer coordinate index */
217 i_coord_offset = DIM*inr;
219 /* Load i particle coords and add shift vector */
220 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
221 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
223 fix0 = _mm_setzero_ps();
224 fiy0 = _mm_setzero_ps();
225 fiz0 = _mm_setzero_ps();
226 fix1 = _mm_setzero_ps();
227 fiy1 = _mm_setzero_ps();
228 fiz1 = _mm_setzero_ps();
229 fix2 = _mm_setzero_ps();
230 fiy2 = _mm_setzero_ps();
231 fiz2 = _mm_setzero_ps();
233 /* Reset potential sums */
234 velecsum = _mm_setzero_ps();
235 vvdwsum = _mm_setzero_ps();
237 /* Start inner kernel loop */
238 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
241 /* Get j neighbor index, and coordinate index */
246 j_coord_offsetA = DIM*jnrA;
247 j_coord_offsetB = DIM*jnrB;
248 j_coord_offsetC = DIM*jnrC;
249 j_coord_offsetD = DIM*jnrD;
251 /* load j atom coordinates */
252 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
253 x+j_coord_offsetC,x+j_coord_offsetD,
254 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
256 /* Calculate displacement vector */
257 dx00 = _mm_sub_ps(ix0,jx0);
258 dy00 = _mm_sub_ps(iy0,jy0);
259 dz00 = _mm_sub_ps(iz0,jz0);
260 dx01 = _mm_sub_ps(ix0,jx1);
261 dy01 = _mm_sub_ps(iy0,jy1);
262 dz01 = _mm_sub_ps(iz0,jz1);
263 dx02 = _mm_sub_ps(ix0,jx2);
264 dy02 = _mm_sub_ps(iy0,jy2);
265 dz02 = _mm_sub_ps(iz0,jz2);
266 dx10 = _mm_sub_ps(ix1,jx0);
267 dy10 = _mm_sub_ps(iy1,jy0);
268 dz10 = _mm_sub_ps(iz1,jz0);
269 dx11 = _mm_sub_ps(ix1,jx1);
270 dy11 = _mm_sub_ps(iy1,jy1);
271 dz11 = _mm_sub_ps(iz1,jz1);
272 dx12 = _mm_sub_ps(ix1,jx2);
273 dy12 = _mm_sub_ps(iy1,jy2);
274 dz12 = _mm_sub_ps(iz1,jz2);
275 dx20 = _mm_sub_ps(ix2,jx0);
276 dy20 = _mm_sub_ps(iy2,jy0);
277 dz20 = _mm_sub_ps(iz2,jz0);
278 dx21 = _mm_sub_ps(ix2,jx1);
279 dy21 = _mm_sub_ps(iy2,jy1);
280 dz21 = _mm_sub_ps(iz2,jz1);
281 dx22 = _mm_sub_ps(ix2,jx2);
282 dy22 = _mm_sub_ps(iy2,jy2);
283 dz22 = _mm_sub_ps(iz2,jz2);
285 /* Calculate squared distance and things based on it */
286 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
287 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
288 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
289 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
290 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
291 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
292 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
293 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
294 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
296 rinv00 = gmx_mm_invsqrt_ps(rsq00);
297 rinv01 = gmx_mm_invsqrt_ps(rsq01);
298 rinv02 = gmx_mm_invsqrt_ps(rsq02);
299 rinv10 = gmx_mm_invsqrt_ps(rsq10);
300 rinv11 = gmx_mm_invsqrt_ps(rsq11);
301 rinv12 = gmx_mm_invsqrt_ps(rsq12);
302 rinv20 = gmx_mm_invsqrt_ps(rsq20);
303 rinv21 = gmx_mm_invsqrt_ps(rsq21);
304 rinv22 = gmx_mm_invsqrt_ps(rsq22);
306 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
307 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
308 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
309 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
310 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
311 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
312 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
313 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
314 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
316 fjx0 = _mm_setzero_ps();
317 fjy0 = _mm_setzero_ps();
318 fjz0 = _mm_setzero_ps();
319 fjx1 = _mm_setzero_ps();
320 fjy1 = _mm_setzero_ps();
321 fjz1 = _mm_setzero_ps();
322 fjx2 = _mm_setzero_ps();
323 fjy2 = _mm_setzero_ps();
324 fjz2 = _mm_setzero_ps();
326 /**************************
327 * CALCULATE INTERACTIONS *
328 **************************/
330 if (gmx_mm_any_lt(rsq00,rcutoff2))
333 r00 = _mm_mul_ps(rsq00,rinv00);
335 /* EWALD ELECTROSTATICS */
337 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
338 ewrt = _mm_mul_ps(r00,ewtabscale);
339 ewitab = _mm_cvttps_epi32(ewrt);
340 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
341 ewitab = _mm_slli_epi32(ewitab,2);
342 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
343 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
344 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
345 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
346 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
347 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
348 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
349 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
350 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
352 /* Analytical LJ-PME */
353 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
354 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
355 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
356 exponent = gmx_simd_exp_r(ewcljrsq);
357 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
358 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
359 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
360 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
361 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
362 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
363 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
364 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
365 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
367 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
369 /* Update potential sum for this i atom from the interaction with this j atom. */
370 velec = _mm_and_ps(velec,cutoff_mask);
371 velecsum = _mm_add_ps(velecsum,velec);
372 vvdw = _mm_and_ps(vvdw,cutoff_mask);
373 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
375 fscal = _mm_add_ps(felec,fvdw);
377 fscal = _mm_and_ps(fscal,cutoff_mask);
379 /* Calculate temporary vectorial force */
380 tx = _mm_mul_ps(fscal,dx00);
381 ty = _mm_mul_ps(fscal,dy00);
382 tz = _mm_mul_ps(fscal,dz00);
384 /* Update vectorial force */
385 fix0 = _mm_add_ps(fix0,tx);
386 fiy0 = _mm_add_ps(fiy0,ty);
387 fiz0 = _mm_add_ps(fiz0,tz);
389 fjx0 = _mm_add_ps(fjx0,tx);
390 fjy0 = _mm_add_ps(fjy0,ty);
391 fjz0 = _mm_add_ps(fjz0,tz);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
399 if (gmx_mm_any_lt(rsq01,rcutoff2))
402 r01 = _mm_mul_ps(rsq01,rinv01);
404 /* EWALD ELECTROSTATICS */
406 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
407 ewrt = _mm_mul_ps(r01,ewtabscale);
408 ewitab = _mm_cvttps_epi32(ewrt);
409 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
410 ewitab = _mm_slli_epi32(ewitab,2);
411 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
412 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
413 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
414 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
415 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
416 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
417 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
418 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
419 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
421 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
423 /* Update potential sum for this i atom from the interaction with this j atom. */
424 velec = _mm_and_ps(velec,cutoff_mask);
425 velecsum = _mm_add_ps(velecsum,velec);
429 fscal = _mm_and_ps(fscal,cutoff_mask);
431 /* Calculate temporary vectorial force */
432 tx = _mm_mul_ps(fscal,dx01);
433 ty = _mm_mul_ps(fscal,dy01);
434 tz = _mm_mul_ps(fscal,dz01);
436 /* Update vectorial force */
437 fix0 = _mm_add_ps(fix0,tx);
438 fiy0 = _mm_add_ps(fiy0,ty);
439 fiz0 = _mm_add_ps(fiz0,tz);
441 fjx1 = _mm_add_ps(fjx1,tx);
442 fjy1 = _mm_add_ps(fjy1,ty);
443 fjz1 = _mm_add_ps(fjz1,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 if (gmx_mm_any_lt(rsq02,rcutoff2))
454 r02 = _mm_mul_ps(rsq02,rinv02);
456 /* EWALD ELECTROSTATICS */
458 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
459 ewrt = _mm_mul_ps(r02,ewtabscale);
460 ewitab = _mm_cvttps_epi32(ewrt);
461 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
462 ewitab = _mm_slli_epi32(ewitab,2);
463 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
464 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
465 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
466 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
467 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
468 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
469 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
470 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
471 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
473 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
475 /* Update potential sum for this i atom from the interaction with this j atom. */
476 velec = _mm_and_ps(velec,cutoff_mask);
477 velecsum = _mm_add_ps(velecsum,velec);
481 fscal = _mm_and_ps(fscal,cutoff_mask);
483 /* Calculate temporary vectorial force */
484 tx = _mm_mul_ps(fscal,dx02);
485 ty = _mm_mul_ps(fscal,dy02);
486 tz = _mm_mul_ps(fscal,dz02);
488 /* Update vectorial force */
489 fix0 = _mm_add_ps(fix0,tx);
490 fiy0 = _mm_add_ps(fiy0,ty);
491 fiz0 = _mm_add_ps(fiz0,tz);
493 fjx2 = _mm_add_ps(fjx2,tx);
494 fjy2 = _mm_add_ps(fjy2,ty);
495 fjz2 = _mm_add_ps(fjz2,tz);
499 /**************************
500 * CALCULATE INTERACTIONS *
501 **************************/
503 if (gmx_mm_any_lt(rsq10,rcutoff2))
506 r10 = _mm_mul_ps(rsq10,rinv10);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm_mul_ps(r10,ewtabscale);
512 ewitab = _mm_cvttps_epi32(ewrt);
513 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
519 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
521 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
522 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
523 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
525 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
527 /* Update potential sum for this i atom from the interaction with this j atom. */
528 velec = _mm_and_ps(velec,cutoff_mask);
529 velecsum = _mm_add_ps(velecsum,velec);
533 fscal = _mm_and_ps(fscal,cutoff_mask);
535 /* Calculate temporary vectorial force */
536 tx = _mm_mul_ps(fscal,dx10);
537 ty = _mm_mul_ps(fscal,dy10);
538 tz = _mm_mul_ps(fscal,dz10);
540 /* Update vectorial force */
541 fix1 = _mm_add_ps(fix1,tx);
542 fiy1 = _mm_add_ps(fiy1,ty);
543 fiz1 = _mm_add_ps(fiz1,tz);
545 fjx0 = _mm_add_ps(fjx0,tx);
546 fjy0 = _mm_add_ps(fjy0,ty);
547 fjz0 = _mm_add_ps(fjz0,tz);
551 /**************************
552 * CALCULATE INTERACTIONS *
553 **************************/
555 if (gmx_mm_any_lt(rsq11,rcutoff2))
558 r11 = _mm_mul_ps(rsq11,rinv11);
560 /* EWALD ELECTROSTATICS */
562 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
563 ewrt = _mm_mul_ps(r11,ewtabscale);
564 ewitab = _mm_cvttps_epi32(ewrt);
565 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
566 ewitab = _mm_slli_epi32(ewitab,2);
567 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
568 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
569 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
570 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
571 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
572 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
573 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
574 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
575 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
577 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
579 /* Update potential sum for this i atom from the interaction with this j atom. */
580 velec = _mm_and_ps(velec,cutoff_mask);
581 velecsum = _mm_add_ps(velecsum,velec);
585 fscal = _mm_and_ps(fscal,cutoff_mask);
587 /* Calculate temporary vectorial force */
588 tx = _mm_mul_ps(fscal,dx11);
589 ty = _mm_mul_ps(fscal,dy11);
590 tz = _mm_mul_ps(fscal,dz11);
592 /* Update vectorial force */
593 fix1 = _mm_add_ps(fix1,tx);
594 fiy1 = _mm_add_ps(fiy1,ty);
595 fiz1 = _mm_add_ps(fiz1,tz);
597 fjx1 = _mm_add_ps(fjx1,tx);
598 fjy1 = _mm_add_ps(fjy1,ty);
599 fjz1 = _mm_add_ps(fjz1,tz);
603 /**************************
604 * CALCULATE INTERACTIONS *
605 **************************/
607 if (gmx_mm_any_lt(rsq12,rcutoff2))
610 r12 = _mm_mul_ps(rsq12,rinv12);
612 /* EWALD ELECTROSTATICS */
614 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
615 ewrt = _mm_mul_ps(r12,ewtabscale);
616 ewitab = _mm_cvttps_epi32(ewrt);
617 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
618 ewitab = _mm_slli_epi32(ewitab,2);
619 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
620 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
621 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
622 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
623 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
624 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
625 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
626 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
627 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
629 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
631 /* Update potential sum for this i atom from the interaction with this j atom. */
632 velec = _mm_and_ps(velec,cutoff_mask);
633 velecsum = _mm_add_ps(velecsum,velec);
637 fscal = _mm_and_ps(fscal,cutoff_mask);
639 /* Calculate temporary vectorial force */
640 tx = _mm_mul_ps(fscal,dx12);
641 ty = _mm_mul_ps(fscal,dy12);
642 tz = _mm_mul_ps(fscal,dz12);
644 /* Update vectorial force */
645 fix1 = _mm_add_ps(fix1,tx);
646 fiy1 = _mm_add_ps(fiy1,ty);
647 fiz1 = _mm_add_ps(fiz1,tz);
649 fjx2 = _mm_add_ps(fjx2,tx);
650 fjy2 = _mm_add_ps(fjy2,ty);
651 fjz2 = _mm_add_ps(fjz2,tz);
655 /**************************
656 * CALCULATE INTERACTIONS *
657 **************************/
659 if (gmx_mm_any_lt(rsq20,rcutoff2))
662 r20 = _mm_mul_ps(rsq20,rinv20);
664 /* EWALD ELECTROSTATICS */
666 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
667 ewrt = _mm_mul_ps(r20,ewtabscale);
668 ewitab = _mm_cvttps_epi32(ewrt);
669 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
670 ewitab = _mm_slli_epi32(ewitab,2);
671 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
672 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
673 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
674 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
675 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
676 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
677 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
678 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
679 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
681 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
683 /* Update potential sum for this i atom from the interaction with this j atom. */
684 velec = _mm_and_ps(velec,cutoff_mask);
685 velecsum = _mm_add_ps(velecsum,velec);
689 fscal = _mm_and_ps(fscal,cutoff_mask);
691 /* Calculate temporary vectorial force */
692 tx = _mm_mul_ps(fscal,dx20);
693 ty = _mm_mul_ps(fscal,dy20);
694 tz = _mm_mul_ps(fscal,dz20);
696 /* Update vectorial force */
697 fix2 = _mm_add_ps(fix2,tx);
698 fiy2 = _mm_add_ps(fiy2,ty);
699 fiz2 = _mm_add_ps(fiz2,tz);
701 fjx0 = _mm_add_ps(fjx0,tx);
702 fjy0 = _mm_add_ps(fjy0,ty);
703 fjz0 = _mm_add_ps(fjz0,tz);
707 /**************************
708 * CALCULATE INTERACTIONS *
709 **************************/
711 if (gmx_mm_any_lt(rsq21,rcutoff2))
714 r21 = _mm_mul_ps(rsq21,rinv21);
716 /* EWALD ELECTROSTATICS */
718 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
719 ewrt = _mm_mul_ps(r21,ewtabscale);
720 ewitab = _mm_cvttps_epi32(ewrt);
721 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
722 ewitab = _mm_slli_epi32(ewitab,2);
723 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
724 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
725 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
726 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
727 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
728 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
729 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
730 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
731 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
733 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velec = _mm_and_ps(velec,cutoff_mask);
737 velecsum = _mm_add_ps(velecsum,velec);
741 fscal = _mm_and_ps(fscal,cutoff_mask);
743 /* Calculate temporary vectorial force */
744 tx = _mm_mul_ps(fscal,dx21);
745 ty = _mm_mul_ps(fscal,dy21);
746 tz = _mm_mul_ps(fscal,dz21);
748 /* Update vectorial force */
749 fix2 = _mm_add_ps(fix2,tx);
750 fiy2 = _mm_add_ps(fiy2,ty);
751 fiz2 = _mm_add_ps(fiz2,tz);
753 fjx1 = _mm_add_ps(fjx1,tx);
754 fjy1 = _mm_add_ps(fjy1,ty);
755 fjz1 = _mm_add_ps(fjz1,tz);
759 /**************************
760 * CALCULATE INTERACTIONS *
761 **************************/
763 if (gmx_mm_any_lt(rsq22,rcutoff2))
766 r22 = _mm_mul_ps(rsq22,rinv22);
768 /* EWALD ELECTROSTATICS */
770 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
771 ewrt = _mm_mul_ps(r22,ewtabscale);
772 ewitab = _mm_cvttps_epi32(ewrt);
773 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
774 ewitab = _mm_slli_epi32(ewitab,2);
775 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
776 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
777 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
778 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
779 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
780 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
781 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
782 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
783 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
785 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
787 /* Update potential sum for this i atom from the interaction with this j atom. */
788 velec = _mm_and_ps(velec,cutoff_mask);
789 velecsum = _mm_add_ps(velecsum,velec);
793 fscal = _mm_and_ps(fscal,cutoff_mask);
795 /* Calculate temporary vectorial force */
796 tx = _mm_mul_ps(fscal,dx22);
797 ty = _mm_mul_ps(fscal,dy22);
798 tz = _mm_mul_ps(fscal,dz22);
800 /* Update vectorial force */
801 fix2 = _mm_add_ps(fix2,tx);
802 fiy2 = _mm_add_ps(fiy2,ty);
803 fiz2 = _mm_add_ps(fiz2,tz);
805 fjx2 = _mm_add_ps(fjx2,tx);
806 fjy2 = _mm_add_ps(fjy2,ty);
807 fjz2 = _mm_add_ps(fjz2,tz);
811 fjptrA = f+j_coord_offsetA;
812 fjptrB = f+j_coord_offsetB;
813 fjptrC = f+j_coord_offsetC;
814 fjptrD = f+j_coord_offsetD;
816 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
817 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
819 /* Inner loop uses 450 flops */
825 /* Get j neighbor index, and coordinate index */
826 jnrlistA = jjnr[jidx];
827 jnrlistB = jjnr[jidx+1];
828 jnrlistC = jjnr[jidx+2];
829 jnrlistD = jjnr[jidx+3];
830 /* Sign of each element will be negative for non-real atoms.
831 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
832 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
834 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
835 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
836 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
837 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
838 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
839 j_coord_offsetA = DIM*jnrA;
840 j_coord_offsetB = DIM*jnrB;
841 j_coord_offsetC = DIM*jnrC;
842 j_coord_offsetD = DIM*jnrD;
844 /* load j atom coordinates */
845 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
846 x+j_coord_offsetC,x+j_coord_offsetD,
847 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
849 /* Calculate displacement vector */
850 dx00 = _mm_sub_ps(ix0,jx0);
851 dy00 = _mm_sub_ps(iy0,jy0);
852 dz00 = _mm_sub_ps(iz0,jz0);
853 dx01 = _mm_sub_ps(ix0,jx1);
854 dy01 = _mm_sub_ps(iy0,jy1);
855 dz01 = _mm_sub_ps(iz0,jz1);
856 dx02 = _mm_sub_ps(ix0,jx2);
857 dy02 = _mm_sub_ps(iy0,jy2);
858 dz02 = _mm_sub_ps(iz0,jz2);
859 dx10 = _mm_sub_ps(ix1,jx0);
860 dy10 = _mm_sub_ps(iy1,jy0);
861 dz10 = _mm_sub_ps(iz1,jz0);
862 dx11 = _mm_sub_ps(ix1,jx1);
863 dy11 = _mm_sub_ps(iy1,jy1);
864 dz11 = _mm_sub_ps(iz1,jz1);
865 dx12 = _mm_sub_ps(ix1,jx2);
866 dy12 = _mm_sub_ps(iy1,jy2);
867 dz12 = _mm_sub_ps(iz1,jz2);
868 dx20 = _mm_sub_ps(ix2,jx0);
869 dy20 = _mm_sub_ps(iy2,jy0);
870 dz20 = _mm_sub_ps(iz2,jz0);
871 dx21 = _mm_sub_ps(ix2,jx1);
872 dy21 = _mm_sub_ps(iy2,jy1);
873 dz21 = _mm_sub_ps(iz2,jz1);
874 dx22 = _mm_sub_ps(ix2,jx2);
875 dy22 = _mm_sub_ps(iy2,jy2);
876 dz22 = _mm_sub_ps(iz2,jz2);
878 /* Calculate squared distance and things based on it */
879 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
880 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
881 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
882 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
883 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
884 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
885 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
886 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
887 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
889 rinv00 = gmx_mm_invsqrt_ps(rsq00);
890 rinv01 = gmx_mm_invsqrt_ps(rsq01);
891 rinv02 = gmx_mm_invsqrt_ps(rsq02);
892 rinv10 = gmx_mm_invsqrt_ps(rsq10);
893 rinv11 = gmx_mm_invsqrt_ps(rsq11);
894 rinv12 = gmx_mm_invsqrt_ps(rsq12);
895 rinv20 = gmx_mm_invsqrt_ps(rsq20);
896 rinv21 = gmx_mm_invsqrt_ps(rsq21);
897 rinv22 = gmx_mm_invsqrt_ps(rsq22);
899 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
900 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
901 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
902 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
903 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
904 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
905 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
906 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
907 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
909 fjx0 = _mm_setzero_ps();
910 fjy0 = _mm_setzero_ps();
911 fjz0 = _mm_setzero_ps();
912 fjx1 = _mm_setzero_ps();
913 fjy1 = _mm_setzero_ps();
914 fjz1 = _mm_setzero_ps();
915 fjx2 = _mm_setzero_ps();
916 fjy2 = _mm_setzero_ps();
917 fjz2 = _mm_setzero_ps();
919 /**************************
920 * CALCULATE INTERACTIONS *
921 **************************/
923 if (gmx_mm_any_lt(rsq00,rcutoff2))
926 r00 = _mm_mul_ps(rsq00,rinv00);
927 r00 = _mm_andnot_ps(dummy_mask,r00);
929 /* EWALD ELECTROSTATICS */
931 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
932 ewrt = _mm_mul_ps(r00,ewtabscale);
933 ewitab = _mm_cvttps_epi32(ewrt);
934 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
935 ewitab = _mm_slli_epi32(ewitab,2);
936 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
937 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
938 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
939 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
940 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
941 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
942 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
943 velec = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
944 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
946 /* Analytical LJ-PME */
947 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
948 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
949 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
950 exponent = gmx_simd_exp_r(ewcljrsq);
951 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
952 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
953 /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
954 vvdw6 = _mm_mul_ps(_mm_sub_ps(c6_00,_mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly))),rinvsix);
955 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
956 vvdw = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
957 _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_add_ps(_mm_mul_ps(c6_00,sh_vdw_invrcut6),_mm_mul_ps(c6grid_00,sh_lj_ewald))),one_sixth));
958 /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
959 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,_mm_sub_ps(vvdw6,_mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6)))),rinvsq00);
961 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
963 /* Update potential sum for this i atom from the interaction with this j atom. */
964 velec = _mm_and_ps(velec,cutoff_mask);
965 velec = _mm_andnot_ps(dummy_mask,velec);
966 velecsum = _mm_add_ps(velecsum,velec);
967 vvdw = _mm_and_ps(vvdw,cutoff_mask);
968 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
969 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
971 fscal = _mm_add_ps(felec,fvdw);
973 fscal = _mm_and_ps(fscal,cutoff_mask);
975 fscal = _mm_andnot_ps(dummy_mask,fscal);
977 /* Calculate temporary vectorial force */
978 tx = _mm_mul_ps(fscal,dx00);
979 ty = _mm_mul_ps(fscal,dy00);
980 tz = _mm_mul_ps(fscal,dz00);
982 /* Update vectorial force */
983 fix0 = _mm_add_ps(fix0,tx);
984 fiy0 = _mm_add_ps(fiy0,ty);
985 fiz0 = _mm_add_ps(fiz0,tz);
987 fjx0 = _mm_add_ps(fjx0,tx);
988 fjy0 = _mm_add_ps(fjy0,ty);
989 fjz0 = _mm_add_ps(fjz0,tz);
993 /**************************
994 * CALCULATE INTERACTIONS *
995 **************************/
997 if (gmx_mm_any_lt(rsq01,rcutoff2))
1000 r01 = _mm_mul_ps(rsq01,rinv01);
1001 r01 = _mm_andnot_ps(dummy_mask,r01);
1003 /* EWALD ELECTROSTATICS */
1005 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1006 ewrt = _mm_mul_ps(r01,ewtabscale);
1007 ewitab = _mm_cvttps_epi32(ewrt);
1008 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1009 ewitab = _mm_slli_epi32(ewitab,2);
1010 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1011 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1012 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1013 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1014 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1015 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1016 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1017 velec = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
1018 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1020 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1022 /* Update potential sum for this i atom from the interaction with this j atom. */
1023 velec = _mm_and_ps(velec,cutoff_mask);
1024 velec = _mm_andnot_ps(dummy_mask,velec);
1025 velecsum = _mm_add_ps(velecsum,velec);
1029 fscal = _mm_and_ps(fscal,cutoff_mask);
1031 fscal = _mm_andnot_ps(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm_mul_ps(fscal,dx01);
1035 ty = _mm_mul_ps(fscal,dy01);
1036 tz = _mm_mul_ps(fscal,dz01);
1038 /* Update vectorial force */
1039 fix0 = _mm_add_ps(fix0,tx);
1040 fiy0 = _mm_add_ps(fiy0,ty);
1041 fiz0 = _mm_add_ps(fiz0,tz);
1043 fjx1 = _mm_add_ps(fjx1,tx);
1044 fjy1 = _mm_add_ps(fjy1,ty);
1045 fjz1 = _mm_add_ps(fjz1,tz);
1049 /**************************
1050 * CALCULATE INTERACTIONS *
1051 **************************/
1053 if (gmx_mm_any_lt(rsq02,rcutoff2))
1056 r02 = _mm_mul_ps(rsq02,rinv02);
1057 r02 = _mm_andnot_ps(dummy_mask,r02);
1059 /* EWALD ELECTROSTATICS */
1061 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1062 ewrt = _mm_mul_ps(r02,ewtabscale);
1063 ewitab = _mm_cvttps_epi32(ewrt);
1064 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1065 ewitab = _mm_slli_epi32(ewitab,2);
1066 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1067 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1068 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1069 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1070 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1071 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1072 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1073 velec = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1074 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1076 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1078 /* Update potential sum for this i atom from the interaction with this j atom. */
1079 velec = _mm_and_ps(velec,cutoff_mask);
1080 velec = _mm_andnot_ps(dummy_mask,velec);
1081 velecsum = _mm_add_ps(velecsum,velec);
1085 fscal = _mm_and_ps(fscal,cutoff_mask);
1087 fscal = _mm_andnot_ps(dummy_mask,fscal);
1089 /* Calculate temporary vectorial force */
1090 tx = _mm_mul_ps(fscal,dx02);
1091 ty = _mm_mul_ps(fscal,dy02);
1092 tz = _mm_mul_ps(fscal,dz02);
1094 /* Update vectorial force */
1095 fix0 = _mm_add_ps(fix0,tx);
1096 fiy0 = _mm_add_ps(fiy0,ty);
1097 fiz0 = _mm_add_ps(fiz0,tz);
1099 fjx2 = _mm_add_ps(fjx2,tx);
1100 fjy2 = _mm_add_ps(fjy2,ty);
1101 fjz2 = _mm_add_ps(fjz2,tz);
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 if (gmx_mm_any_lt(rsq10,rcutoff2))
1112 r10 = _mm_mul_ps(rsq10,rinv10);
1113 r10 = _mm_andnot_ps(dummy_mask,r10);
1115 /* EWALD ELECTROSTATICS */
1117 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1118 ewrt = _mm_mul_ps(r10,ewtabscale);
1119 ewitab = _mm_cvttps_epi32(ewrt);
1120 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1121 ewitab = _mm_slli_epi32(ewitab,2);
1122 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1123 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1124 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1125 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1126 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1127 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1128 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1129 velec = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1130 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1132 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1134 /* Update potential sum for this i atom from the interaction with this j atom. */
1135 velec = _mm_and_ps(velec,cutoff_mask);
1136 velec = _mm_andnot_ps(dummy_mask,velec);
1137 velecsum = _mm_add_ps(velecsum,velec);
1141 fscal = _mm_and_ps(fscal,cutoff_mask);
1143 fscal = _mm_andnot_ps(dummy_mask,fscal);
1145 /* Calculate temporary vectorial force */
1146 tx = _mm_mul_ps(fscal,dx10);
1147 ty = _mm_mul_ps(fscal,dy10);
1148 tz = _mm_mul_ps(fscal,dz10);
1150 /* Update vectorial force */
1151 fix1 = _mm_add_ps(fix1,tx);
1152 fiy1 = _mm_add_ps(fiy1,ty);
1153 fiz1 = _mm_add_ps(fiz1,tz);
1155 fjx0 = _mm_add_ps(fjx0,tx);
1156 fjy0 = _mm_add_ps(fjy0,ty);
1157 fjz0 = _mm_add_ps(fjz0,tz);
1161 /**************************
1162 * CALCULATE INTERACTIONS *
1163 **************************/
1165 if (gmx_mm_any_lt(rsq11,rcutoff2))
1168 r11 = _mm_mul_ps(rsq11,rinv11);
1169 r11 = _mm_andnot_ps(dummy_mask,r11);
1171 /* EWALD ELECTROSTATICS */
1173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1174 ewrt = _mm_mul_ps(r11,ewtabscale);
1175 ewitab = _mm_cvttps_epi32(ewrt);
1176 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1177 ewitab = _mm_slli_epi32(ewitab,2);
1178 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1179 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1180 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1181 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1182 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1183 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1184 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1185 velec = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1186 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1188 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1190 /* Update potential sum for this i atom from the interaction with this j atom. */
1191 velec = _mm_and_ps(velec,cutoff_mask);
1192 velec = _mm_andnot_ps(dummy_mask,velec);
1193 velecsum = _mm_add_ps(velecsum,velec);
1197 fscal = _mm_and_ps(fscal,cutoff_mask);
1199 fscal = _mm_andnot_ps(dummy_mask,fscal);
1201 /* Calculate temporary vectorial force */
1202 tx = _mm_mul_ps(fscal,dx11);
1203 ty = _mm_mul_ps(fscal,dy11);
1204 tz = _mm_mul_ps(fscal,dz11);
1206 /* Update vectorial force */
1207 fix1 = _mm_add_ps(fix1,tx);
1208 fiy1 = _mm_add_ps(fiy1,ty);
1209 fiz1 = _mm_add_ps(fiz1,tz);
1211 fjx1 = _mm_add_ps(fjx1,tx);
1212 fjy1 = _mm_add_ps(fjy1,ty);
1213 fjz1 = _mm_add_ps(fjz1,tz);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 if (gmx_mm_any_lt(rsq12,rcutoff2))
1224 r12 = _mm_mul_ps(rsq12,rinv12);
1225 r12 = _mm_andnot_ps(dummy_mask,r12);
1227 /* EWALD ELECTROSTATICS */
1229 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1230 ewrt = _mm_mul_ps(r12,ewtabscale);
1231 ewitab = _mm_cvttps_epi32(ewrt);
1232 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1233 ewitab = _mm_slli_epi32(ewitab,2);
1234 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1235 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1236 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1237 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1238 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1239 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1240 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1241 velec = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1242 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1244 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
1246 /* Update potential sum for this i atom from the interaction with this j atom. */
1247 velec = _mm_and_ps(velec,cutoff_mask);
1248 velec = _mm_andnot_ps(dummy_mask,velec);
1249 velecsum = _mm_add_ps(velecsum,velec);
1253 fscal = _mm_and_ps(fscal,cutoff_mask);
1255 fscal = _mm_andnot_ps(dummy_mask,fscal);
1257 /* Calculate temporary vectorial force */
1258 tx = _mm_mul_ps(fscal,dx12);
1259 ty = _mm_mul_ps(fscal,dy12);
1260 tz = _mm_mul_ps(fscal,dz12);
1262 /* Update vectorial force */
1263 fix1 = _mm_add_ps(fix1,tx);
1264 fiy1 = _mm_add_ps(fiy1,ty);
1265 fiz1 = _mm_add_ps(fiz1,tz);
1267 fjx2 = _mm_add_ps(fjx2,tx);
1268 fjy2 = _mm_add_ps(fjy2,ty);
1269 fjz2 = _mm_add_ps(fjz2,tz);
1273 /**************************
1274 * CALCULATE INTERACTIONS *
1275 **************************/
1277 if (gmx_mm_any_lt(rsq20,rcutoff2))
1280 r20 = _mm_mul_ps(rsq20,rinv20);
1281 r20 = _mm_andnot_ps(dummy_mask,r20);
1283 /* EWALD ELECTROSTATICS */
1285 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1286 ewrt = _mm_mul_ps(r20,ewtabscale);
1287 ewitab = _mm_cvttps_epi32(ewrt);
1288 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1289 ewitab = _mm_slli_epi32(ewitab,2);
1290 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1291 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1292 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1293 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1294 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1295 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1296 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1297 velec = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1298 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1300 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
1302 /* Update potential sum for this i atom from the interaction with this j atom. */
1303 velec = _mm_and_ps(velec,cutoff_mask);
1304 velec = _mm_andnot_ps(dummy_mask,velec);
1305 velecsum = _mm_add_ps(velecsum,velec);
1309 fscal = _mm_and_ps(fscal,cutoff_mask);
1311 fscal = _mm_andnot_ps(dummy_mask,fscal);
1313 /* Calculate temporary vectorial force */
1314 tx = _mm_mul_ps(fscal,dx20);
1315 ty = _mm_mul_ps(fscal,dy20);
1316 tz = _mm_mul_ps(fscal,dz20);
1318 /* Update vectorial force */
1319 fix2 = _mm_add_ps(fix2,tx);
1320 fiy2 = _mm_add_ps(fiy2,ty);
1321 fiz2 = _mm_add_ps(fiz2,tz);
1323 fjx0 = _mm_add_ps(fjx0,tx);
1324 fjy0 = _mm_add_ps(fjy0,ty);
1325 fjz0 = _mm_add_ps(fjz0,tz);
1329 /**************************
1330 * CALCULATE INTERACTIONS *
1331 **************************/
1333 if (gmx_mm_any_lt(rsq21,rcutoff2))
1336 r21 = _mm_mul_ps(rsq21,rinv21);
1337 r21 = _mm_andnot_ps(dummy_mask,r21);
1339 /* EWALD ELECTROSTATICS */
1341 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1342 ewrt = _mm_mul_ps(r21,ewtabscale);
1343 ewitab = _mm_cvttps_epi32(ewrt);
1344 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1345 ewitab = _mm_slli_epi32(ewitab,2);
1346 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1347 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1348 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1349 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1350 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1351 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1352 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1353 velec = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1354 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1356 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
1358 /* Update potential sum for this i atom from the interaction with this j atom. */
1359 velec = _mm_and_ps(velec,cutoff_mask);
1360 velec = _mm_andnot_ps(dummy_mask,velec);
1361 velecsum = _mm_add_ps(velecsum,velec);
1365 fscal = _mm_and_ps(fscal,cutoff_mask);
1367 fscal = _mm_andnot_ps(dummy_mask,fscal);
1369 /* Calculate temporary vectorial force */
1370 tx = _mm_mul_ps(fscal,dx21);
1371 ty = _mm_mul_ps(fscal,dy21);
1372 tz = _mm_mul_ps(fscal,dz21);
1374 /* Update vectorial force */
1375 fix2 = _mm_add_ps(fix2,tx);
1376 fiy2 = _mm_add_ps(fiy2,ty);
1377 fiz2 = _mm_add_ps(fiz2,tz);
1379 fjx1 = _mm_add_ps(fjx1,tx);
1380 fjy1 = _mm_add_ps(fjy1,ty);
1381 fjz1 = _mm_add_ps(fjz1,tz);
1385 /**************************
1386 * CALCULATE INTERACTIONS *
1387 **************************/
1389 if (gmx_mm_any_lt(rsq22,rcutoff2))
1392 r22 = _mm_mul_ps(rsq22,rinv22);
1393 r22 = _mm_andnot_ps(dummy_mask,r22);
1395 /* EWALD ELECTROSTATICS */
1397 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1398 ewrt = _mm_mul_ps(r22,ewtabscale);
1399 ewitab = _mm_cvttps_epi32(ewrt);
1400 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1401 ewitab = _mm_slli_epi32(ewitab,2);
1402 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1403 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1404 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1405 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1406 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1407 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1408 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1409 velec = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1410 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1412 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
1414 /* Update potential sum for this i atom from the interaction with this j atom. */
1415 velec = _mm_and_ps(velec,cutoff_mask);
1416 velec = _mm_andnot_ps(dummy_mask,velec);
1417 velecsum = _mm_add_ps(velecsum,velec);
1421 fscal = _mm_and_ps(fscal,cutoff_mask);
1423 fscal = _mm_andnot_ps(dummy_mask,fscal);
1425 /* Calculate temporary vectorial force */
1426 tx = _mm_mul_ps(fscal,dx22);
1427 ty = _mm_mul_ps(fscal,dy22);
1428 tz = _mm_mul_ps(fscal,dz22);
1430 /* Update vectorial force */
1431 fix2 = _mm_add_ps(fix2,tx);
1432 fiy2 = _mm_add_ps(fiy2,ty);
1433 fiz2 = _mm_add_ps(fiz2,tz);
1435 fjx2 = _mm_add_ps(fjx2,tx);
1436 fjy2 = _mm_add_ps(fjy2,ty);
1437 fjz2 = _mm_add_ps(fjz2,tz);
1441 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1442 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1443 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1444 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1446 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1447 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1449 /* Inner loop uses 459 flops */
1452 /* End of innermost loop */
1454 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1455 f+i_coord_offset,fshift+i_shift_offset);
1458 /* Update potential energies */
1459 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1460 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1462 /* Increment number of inner iterations */
1463 inneriter += j_index_end - j_index_start;
1465 /* Outer loop uses 20 flops */
1468 /* Increment number of outer iterations */
1471 /* Update outer/inner flops */
1473 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*459);
1476 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_single
1477 * Electrostatics interaction: Ewald
1478 * VdW interaction: LJEwald
1479 * Geometry: Water3-Water3
1480 * Calculate force/pot: Force
1483 nb_kernel_ElecEwSh_VdwLJEwSh_GeomW3W3_F_sse2_single
1484 (t_nblist * gmx_restrict nlist,
1485 rvec * gmx_restrict xx,
1486 rvec * gmx_restrict ff,
1487 t_forcerec * gmx_restrict fr,
1488 t_mdatoms * gmx_restrict mdatoms,
1489 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1490 t_nrnb * gmx_restrict nrnb)
1492 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1493 * just 0 for non-waters.
1494 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1495 * jnr indices corresponding to data put in the four positions in the SIMD register.
1497 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1498 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1499 int jnrA,jnrB,jnrC,jnrD;
1500 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1501 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1502 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1503 real rcutoff_scalar;
1504 real *shiftvec,*fshift,*x,*f;
1505 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1506 real scratch[4*DIM];
1507 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1509 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1511 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1513 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1514 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1515 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1516 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1517 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1518 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1519 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1520 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1521 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1522 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1523 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1524 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1525 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1526 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1527 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1528 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1529 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1532 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1535 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1536 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1546 __m128 ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1548 __m128 one_half = _mm_set1_ps(0.5);
1549 __m128 minus_one = _mm_set1_ps(-1.0);
1551 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1553 __m128 dummy_mask,cutoff_mask;
1554 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1555 __m128 one = _mm_set1_ps(1.0);
1556 __m128 two = _mm_set1_ps(2.0);
1562 jindex = nlist->jindex;
1564 shiftidx = nlist->shift;
1566 shiftvec = fr->shift_vec[0];
1567 fshift = fr->fshift[0];
1568 facel = _mm_set1_ps(fr->epsfac);
1569 charge = mdatoms->chargeA;
1570 nvdwtype = fr->ntype;
1571 vdwparam = fr->nbfp;
1572 vdwtype = mdatoms->typeA;
1573 vdwgridparam = fr->ljpme_c6grid;
1574 sh_lj_ewald = _mm_set1_ps(fr->ic->sh_lj_ewald);
1575 ewclj = _mm_set1_ps(fr->ewaldcoeff_lj);
1576 ewclj2 = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1578 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1579 ewtab = fr->ic->tabq_coul_F;
1580 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1581 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1583 /* Setup water-specific parameters */
1584 inr = nlist->iinr[0];
1585 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1586 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1587 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1588 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1590 jq0 = _mm_set1_ps(charge[inr+0]);
1591 jq1 = _mm_set1_ps(charge[inr+1]);
1592 jq2 = _mm_set1_ps(charge[inr+2]);
1593 vdwjidx0A = 2*vdwtype[inr+0];
1594 qq00 = _mm_mul_ps(iq0,jq0);
1595 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1596 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1597 c6grid_00 = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1598 qq01 = _mm_mul_ps(iq0,jq1);
1599 qq02 = _mm_mul_ps(iq0,jq2);
1600 qq10 = _mm_mul_ps(iq1,jq0);
1601 qq11 = _mm_mul_ps(iq1,jq1);
1602 qq12 = _mm_mul_ps(iq1,jq2);
1603 qq20 = _mm_mul_ps(iq2,jq0);
1604 qq21 = _mm_mul_ps(iq2,jq1);
1605 qq22 = _mm_mul_ps(iq2,jq2);
1607 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1608 rcutoff_scalar = fr->rcoulomb;
1609 rcutoff = _mm_set1_ps(rcutoff_scalar);
1610 rcutoff2 = _mm_mul_ps(rcutoff,rcutoff);
1612 sh_vdw_invrcut6 = _mm_set1_ps(fr->ic->sh_invrc6);
1613 rvdw = _mm_set1_ps(fr->rvdw);
1615 /* Avoid stupid compiler warnings */
1616 jnrA = jnrB = jnrC = jnrD = 0;
1617 j_coord_offsetA = 0;
1618 j_coord_offsetB = 0;
1619 j_coord_offsetC = 0;
1620 j_coord_offsetD = 0;
1625 for(iidx=0;iidx<4*DIM;iidx++)
1627 scratch[iidx] = 0.0;
1630 /* Start outer loop over neighborlists */
1631 for(iidx=0; iidx<nri; iidx++)
1633 /* Load shift vector for this list */
1634 i_shift_offset = DIM*shiftidx[iidx];
1636 /* Load limits for loop over neighbors */
1637 j_index_start = jindex[iidx];
1638 j_index_end = jindex[iidx+1];
1640 /* Get outer coordinate index */
1642 i_coord_offset = DIM*inr;
1644 /* Load i particle coords and add shift vector */
1645 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1646 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1648 fix0 = _mm_setzero_ps();
1649 fiy0 = _mm_setzero_ps();
1650 fiz0 = _mm_setzero_ps();
1651 fix1 = _mm_setzero_ps();
1652 fiy1 = _mm_setzero_ps();
1653 fiz1 = _mm_setzero_ps();
1654 fix2 = _mm_setzero_ps();
1655 fiy2 = _mm_setzero_ps();
1656 fiz2 = _mm_setzero_ps();
1658 /* Start inner kernel loop */
1659 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1662 /* Get j neighbor index, and coordinate index */
1664 jnrB = jjnr[jidx+1];
1665 jnrC = jjnr[jidx+2];
1666 jnrD = jjnr[jidx+3];
1667 j_coord_offsetA = DIM*jnrA;
1668 j_coord_offsetB = DIM*jnrB;
1669 j_coord_offsetC = DIM*jnrC;
1670 j_coord_offsetD = DIM*jnrD;
1672 /* load j atom coordinates */
1673 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1674 x+j_coord_offsetC,x+j_coord_offsetD,
1675 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1677 /* Calculate displacement vector */
1678 dx00 = _mm_sub_ps(ix0,jx0);
1679 dy00 = _mm_sub_ps(iy0,jy0);
1680 dz00 = _mm_sub_ps(iz0,jz0);
1681 dx01 = _mm_sub_ps(ix0,jx1);
1682 dy01 = _mm_sub_ps(iy0,jy1);
1683 dz01 = _mm_sub_ps(iz0,jz1);
1684 dx02 = _mm_sub_ps(ix0,jx2);
1685 dy02 = _mm_sub_ps(iy0,jy2);
1686 dz02 = _mm_sub_ps(iz0,jz2);
1687 dx10 = _mm_sub_ps(ix1,jx0);
1688 dy10 = _mm_sub_ps(iy1,jy0);
1689 dz10 = _mm_sub_ps(iz1,jz0);
1690 dx11 = _mm_sub_ps(ix1,jx1);
1691 dy11 = _mm_sub_ps(iy1,jy1);
1692 dz11 = _mm_sub_ps(iz1,jz1);
1693 dx12 = _mm_sub_ps(ix1,jx2);
1694 dy12 = _mm_sub_ps(iy1,jy2);
1695 dz12 = _mm_sub_ps(iz1,jz2);
1696 dx20 = _mm_sub_ps(ix2,jx0);
1697 dy20 = _mm_sub_ps(iy2,jy0);
1698 dz20 = _mm_sub_ps(iz2,jz0);
1699 dx21 = _mm_sub_ps(ix2,jx1);
1700 dy21 = _mm_sub_ps(iy2,jy1);
1701 dz21 = _mm_sub_ps(iz2,jz1);
1702 dx22 = _mm_sub_ps(ix2,jx2);
1703 dy22 = _mm_sub_ps(iy2,jy2);
1704 dz22 = _mm_sub_ps(iz2,jz2);
1706 /* Calculate squared distance and things based on it */
1707 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1708 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1709 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1710 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1711 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1712 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1713 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1714 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1715 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1717 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1718 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1719 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1720 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1721 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1722 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1723 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1724 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1725 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1727 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1728 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
1729 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
1730 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
1731 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1732 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1733 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
1734 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1735 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1737 fjx0 = _mm_setzero_ps();
1738 fjy0 = _mm_setzero_ps();
1739 fjz0 = _mm_setzero_ps();
1740 fjx1 = _mm_setzero_ps();
1741 fjy1 = _mm_setzero_ps();
1742 fjz1 = _mm_setzero_ps();
1743 fjx2 = _mm_setzero_ps();
1744 fjy2 = _mm_setzero_ps();
1745 fjz2 = _mm_setzero_ps();
1747 /**************************
1748 * CALCULATE INTERACTIONS *
1749 **************************/
1751 if (gmx_mm_any_lt(rsq00,rcutoff2))
1754 r00 = _mm_mul_ps(rsq00,rinv00);
1756 /* EWALD ELECTROSTATICS */
1758 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1759 ewrt = _mm_mul_ps(r00,ewtabscale);
1760 ewitab = _mm_cvttps_epi32(ewrt);
1761 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1762 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1763 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1765 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1766 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1768 /* Analytical LJ-PME */
1769 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1770 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
1771 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1772 exponent = gmx_simd_exp_r(ewcljrsq);
1773 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1774 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
1775 /* f6A = 6 * C6grid * (1 - poly) */
1776 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1777 /* f6B = C6grid * exponent * beta^6 */
1778 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1779 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1780 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
1782 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
1784 fscal = _mm_add_ps(felec,fvdw);
1786 fscal = _mm_and_ps(fscal,cutoff_mask);
1788 /* Calculate temporary vectorial force */
1789 tx = _mm_mul_ps(fscal,dx00);
1790 ty = _mm_mul_ps(fscal,dy00);
1791 tz = _mm_mul_ps(fscal,dz00);
1793 /* Update vectorial force */
1794 fix0 = _mm_add_ps(fix0,tx);
1795 fiy0 = _mm_add_ps(fiy0,ty);
1796 fiz0 = _mm_add_ps(fiz0,tz);
1798 fjx0 = _mm_add_ps(fjx0,tx);
1799 fjy0 = _mm_add_ps(fjy0,ty);
1800 fjz0 = _mm_add_ps(fjz0,tz);
1804 /**************************
1805 * CALCULATE INTERACTIONS *
1806 **************************/
1808 if (gmx_mm_any_lt(rsq01,rcutoff2))
1811 r01 = _mm_mul_ps(rsq01,rinv01);
1813 /* EWALD ELECTROSTATICS */
1815 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1816 ewrt = _mm_mul_ps(r01,ewtabscale);
1817 ewitab = _mm_cvttps_epi32(ewrt);
1818 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1819 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1820 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1822 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1823 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1825 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
1829 fscal = _mm_and_ps(fscal,cutoff_mask);
1831 /* Calculate temporary vectorial force */
1832 tx = _mm_mul_ps(fscal,dx01);
1833 ty = _mm_mul_ps(fscal,dy01);
1834 tz = _mm_mul_ps(fscal,dz01);
1836 /* Update vectorial force */
1837 fix0 = _mm_add_ps(fix0,tx);
1838 fiy0 = _mm_add_ps(fiy0,ty);
1839 fiz0 = _mm_add_ps(fiz0,tz);
1841 fjx1 = _mm_add_ps(fjx1,tx);
1842 fjy1 = _mm_add_ps(fjy1,ty);
1843 fjz1 = _mm_add_ps(fjz1,tz);
1847 /**************************
1848 * CALCULATE INTERACTIONS *
1849 **************************/
1851 if (gmx_mm_any_lt(rsq02,rcutoff2))
1854 r02 = _mm_mul_ps(rsq02,rinv02);
1856 /* EWALD ELECTROSTATICS */
1858 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1859 ewrt = _mm_mul_ps(r02,ewtabscale);
1860 ewitab = _mm_cvttps_epi32(ewrt);
1861 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1862 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1863 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1865 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1866 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1868 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
1872 fscal = _mm_and_ps(fscal,cutoff_mask);
1874 /* Calculate temporary vectorial force */
1875 tx = _mm_mul_ps(fscal,dx02);
1876 ty = _mm_mul_ps(fscal,dy02);
1877 tz = _mm_mul_ps(fscal,dz02);
1879 /* Update vectorial force */
1880 fix0 = _mm_add_ps(fix0,tx);
1881 fiy0 = _mm_add_ps(fiy0,ty);
1882 fiz0 = _mm_add_ps(fiz0,tz);
1884 fjx2 = _mm_add_ps(fjx2,tx);
1885 fjy2 = _mm_add_ps(fjy2,ty);
1886 fjz2 = _mm_add_ps(fjz2,tz);
1890 /**************************
1891 * CALCULATE INTERACTIONS *
1892 **************************/
1894 if (gmx_mm_any_lt(rsq10,rcutoff2))
1897 r10 = _mm_mul_ps(rsq10,rinv10);
1899 /* EWALD ELECTROSTATICS */
1901 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1902 ewrt = _mm_mul_ps(r10,ewtabscale);
1903 ewitab = _mm_cvttps_epi32(ewrt);
1904 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1905 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1906 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1908 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1909 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1911 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
1915 fscal = _mm_and_ps(fscal,cutoff_mask);
1917 /* Calculate temporary vectorial force */
1918 tx = _mm_mul_ps(fscal,dx10);
1919 ty = _mm_mul_ps(fscal,dy10);
1920 tz = _mm_mul_ps(fscal,dz10);
1922 /* Update vectorial force */
1923 fix1 = _mm_add_ps(fix1,tx);
1924 fiy1 = _mm_add_ps(fiy1,ty);
1925 fiz1 = _mm_add_ps(fiz1,tz);
1927 fjx0 = _mm_add_ps(fjx0,tx);
1928 fjy0 = _mm_add_ps(fjy0,ty);
1929 fjz0 = _mm_add_ps(fjz0,tz);
1933 /**************************
1934 * CALCULATE INTERACTIONS *
1935 **************************/
1937 if (gmx_mm_any_lt(rsq11,rcutoff2))
1940 r11 = _mm_mul_ps(rsq11,rinv11);
1942 /* EWALD ELECTROSTATICS */
1944 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1945 ewrt = _mm_mul_ps(r11,ewtabscale);
1946 ewitab = _mm_cvttps_epi32(ewrt);
1947 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1948 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1949 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1951 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1952 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1954 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
1958 fscal = _mm_and_ps(fscal,cutoff_mask);
1960 /* Calculate temporary vectorial force */
1961 tx = _mm_mul_ps(fscal,dx11);
1962 ty = _mm_mul_ps(fscal,dy11);
1963 tz = _mm_mul_ps(fscal,dz11);
1965 /* Update vectorial force */
1966 fix1 = _mm_add_ps(fix1,tx);
1967 fiy1 = _mm_add_ps(fiy1,ty);
1968 fiz1 = _mm_add_ps(fiz1,tz);
1970 fjx1 = _mm_add_ps(fjx1,tx);
1971 fjy1 = _mm_add_ps(fjy1,ty);
1972 fjz1 = _mm_add_ps(fjz1,tz);
1976 /**************************
1977 * CALCULATE INTERACTIONS *
1978 **************************/
1980 if (gmx_mm_any_lt(rsq12,rcutoff2))
1983 r12 = _mm_mul_ps(rsq12,rinv12);
1985 /* EWALD ELECTROSTATICS */
1987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988 ewrt = _mm_mul_ps(r12,ewtabscale);
1989 ewitab = _mm_cvttps_epi32(ewrt);
1990 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1991 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1992 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1994 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1995 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1997 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2001 fscal = _mm_and_ps(fscal,cutoff_mask);
2003 /* Calculate temporary vectorial force */
2004 tx = _mm_mul_ps(fscal,dx12);
2005 ty = _mm_mul_ps(fscal,dy12);
2006 tz = _mm_mul_ps(fscal,dz12);
2008 /* Update vectorial force */
2009 fix1 = _mm_add_ps(fix1,tx);
2010 fiy1 = _mm_add_ps(fiy1,ty);
2011 fiz1 = _mm_add_ps(fiz1,tz);
2013 fjx2 = _mm_add_ps(fjx2,tx);
2014 fjy2 = _mm_add_ps(fjy2,ty);
2015 fjz2 = _mm_add_ps(fjz2,tz);
2019 /**************************
2020 * CALCULATE INTERACTIONS *
2021 **************************/
2023 if (gmx_mm_any_lt(rsq20,rcutoff2))
2026 r20 = _mm_mul_ps(rsq20,rinv20);
2028 /* EWALD ELECTROSTATICS */
2030 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2031 ewrt = _mm_mul_ps(r20,ewtabscale);
2032 ewitab = _mm_cvttps_epi32(ewrt);
2033 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2034 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2035 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2037 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2038 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2040 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2044 fscal = _mm_and_ps(fscal,cutoff_mask);
2046 /* Calculate temporary vectorial force */
2047 tx = _mm_mul_ps(fscal,dx20);
2048 ty = _mm_mul_ps(fscal,dy20);
2049 tz = _mm_mul_ps(fscal,dz20);
2051 /* Update vectorial force */
2052 fix2 = _mm_add_ps(fix2,tx);
2053 fiy2 = _mm_add_ps(fiy2,ty);
2054 fiz2 = _mm_add_ps(fiz2,tz);
2056 fjx0 = _mm_add_ps(fjx0,tx);
2057 fjy0 = _mm_add_ps(fjy0,ty);
2058 fjz0 = _mm_add_ps(fjz0,tz);
2062 /**************************
2063 * CALCULATE INTERACTIONS *
2064 **************************/
2066 if (gmx_mm_any_lt(rsq21,rcutoff2))
2069 r21 = _mm_mul_ps(rsq21,rinv21);
2071 /* EWALD ELECTROSTATICS */
2073 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2074 ewrt = _mm_mul_ps(r21,ewtabscale);
2075 ewitab = _mm_cvttps_epi32(ewrt);
2076 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2077 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2078 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2080 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2081 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2083 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2087 fscal = _mm_and_ps(fscal,cutoff_mask);
2089 /* Calculate temporary vectorial force */
2090 tx = _mm_mul_ps(fscal,dx21);
2091 ty = _mm_mul_ps(fscal,dy21);
2092 tz = _mm_mul_ps(fscal,dz21);
2094 /* Update vectorial force */
2095 fix2 = _mm_add_ps(fix2,tx);
2096 fiy2 = _mm_add_ps(fiy2,ty);
2097 fiz2 = _mm_add_ps(fiz2,tz);
2099 fjx1 = _mm_add_ps(fjx1,tx);
2100 fjy1 = _mm_add_ps(fjy1,ty);
2101 fjz1 = _mm_add_ps(fjz1,tz);
2105 /**************************
2106 * CALCULATE INTERACTIONS *
2107 **************************/
2109 if (gmx_mm_any_lt(rsq22,rcutoff2))
2112 r22 = _mm_mul_ps(rsq22,rinv22);
2114 /* EWALD ELECTROSTATICS */
2116 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2117 ewrt = _mm_mul_ps(r22,ewtabscale);
2118 ewitab = _mm_cvttps_epi32(ewrt);
2119 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2120 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2121 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2123 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2124 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2126 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2130 fscal = _mm_and_ps(fscal,cutoff_mask);
2132 /* Calculate temporary vectorial force */
2133 tx = _mm_mul_ps(fscal,dx22);
2134 ty = _mm_mul_ps(fscal,dy22);
2135 tz = _mm_mul_ps(fscal,dz22);
2137 /* Update vectorial force */
2138 fix2 = _mm_add_ps(fix2,tx);
2139 fiy2 = _mm_add_ps(fiy2,ty);
2140 fiz2 = _mm_add_ps(fiz2,tz);
2142 fjx2 = _mm_add_ps(fjx2,tx);
2143 fjy2 = _mm_add_ps(fjy2,ty);
2144 fjz2 = _mm_add_ps(fjz2,tz);
2148 fjptrA = f+j_coord_offsetA;
2149 fjptrB = f+j_coord_offsetB;
2150 fjptrC = f+j_coord_offsetC;
2151 fjptrD = f+j_coord_offsetD;
2153 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2154 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2156 /* Inner loop uses 374 flops */
2159 if(jidx<j_index_end)
2162 /* Get j neighbor index, and coordinate index */
2163 jnrlistA = jjnr[jidx];
2164 jnrlistB = jjnr[jidx+1];
2165 jnrlistC = jjnr[jidx+2];
2166 jnrlistD = jjnr[jidx+3];
2167 /* Sign of each element will be negative for non-real atoms.
2168 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2169 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2171 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2172 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2173 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2174 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2175 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2176 j_coord_offsetA = DIM*jnrA;
2177 j_coord_offsetB = DIM*jnrB;
2178 j_coord_offsetC = DIM*jnrC;
2179 j_coord_offsetD = DIM*jnrD;
2181 /* load j atom coordinates */
2182 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2183 x+j_coord_offsetC,x+j_coord_offsetD,
2184 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2186 /* Calculate displacement vector */
2187 dx00 = _mm_sub_ps(ix0,jx0);
2188 dy00 = _mm_sub_ps(iy0,jy0);
2189 dz00 = _mm_sub_ps(iz0,jz0);
2190 dx01 = _mm_sub_ps(ix0,jx1);
2191 dy01 = _mm_sub_ps(iy0,jy1);
2192 dz01 = _mm_sub_ps(iz0,jz1);
2193 dx02 = _mm_sub_ps(ix0,jx2);
2194 dy02 = _mm_sub_ps(iy0,jy2);
2195 dz02 = _mm_sub_ps(iz0,jz2);
2196 dx10 = _mm_sub_ps(ix1,jx0);
2197 dy10 = _mm_sub_ps(iy1,jy0);
2198 dz10 = _mm_sub_ps(iz1,jz0);
2199 dx11 = _mm_sub_ps(ix1,jx1);
2200 dy11 = _mm_sub_ps(iy1,jy1);
2201 dz11 = _mm_sub_ps(iz1,jz1);
2202 dx12 = _mm_sub_ps(ix1,jx2);
2203 dy12 = _mm_sub_ps(iy1,jy2);
2204 dz12 = _mm_sub_ps(iz1,jz2);
2205 dx20 = _mm_sub_ps(ix2,jx0);
2206 dy20 = _mm_sub_ps(iy2,jy0);
2207 dz20 = _mm_sub_ps(iz2,jz0);
2208 dx21 = _mm_sub_ps(ix2,jx1);
2209 dy21 = _mm_sub_ps(iy2,jy1);
2210 dz21 = _mm_sub_ps(iz2,jz1);
2211 dx22 = _mm_sub_ps(ix2,jx2);
2212 dy22 = _mm_sub_ps(iy2,jy2);
2213 dz22 = _mm_sub_ps(iz2,jz2);
2215 /* Calculate squared distance and things based on it */
2216 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2217 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2218 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2219 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2220 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2221 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2222 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2223 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2224 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2226 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2227 rinv01 = gmx_mm_invsqrt_ps(rsq01);
2228 rinv02 = gmx_mm_invsqrt_ps(rsq02);
2229 rinv10 = gmx_mm_invsqrt_ps(rsq10);
2230 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2231 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2232 rinv20 = gmx_mm_invsqrt_ps(rsq20);
2233 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2234 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2236 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
2237 rinvsq01 = _mm_mul_ps(rinv01,rinv01);
2238 rinvsq02 = _mm_mul_ps(rinv02,rinv02);
2239 rinvsq10 = _mm_mul_ps(rinv10,rinv10);
2240 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2241 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2242 rinvsq20 = _mm_mul_ps(rinv20,rinv20);
2243 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2244 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2246 fjx0 = _mm_setzero_ps();
2247 fjy0 = _mm_setzero_ps();
2248 fjz0 = _mm_setzero_ps();
2249 fjx1 = _mm_setzero_ps();
2250 fjy1 = _mm_setzero_ps();
2251 fjz1 = _mm_setzero_ps();
2252 fjx2 = _mm_setzero_ps();
2253 fjy2 = _mm_setzero_ps();
2254 fjz2 = _mm_setzero_ps();
2256 /**************************
2257 * CALCULATE INTERACTIONS *
2258 **************************/
2260 if (gmx_mm_any_lt(rsq00,rcutoff2))
2263 r00 = _mm_mul_ps(rsq00,rinv00);
2264 r00 = _mm_andnot_ps(dummy_mask,r00);
2266 /* EWALD ELECTROSTATICS */
2268 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2269 ewrt = _mm_mul_ps(r00,ewtabscale);
2270 ewitab = _mm_cvttps_epi32(ewrt);
2271 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2272 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2273 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2275 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2276 felec = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2278 /* Analytical LJ-PME */
2279 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2280 ewcljrsq = _mm_mul_ps(ewclj2,rsq00);
2281 ewclj6 = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
2282 exponent = gmx_simd_exp_r(ewcljrsq);
2283 /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
2284 poly = _mm_mul_ps(exponent,_mm_add_ps(_mm_sub_ps(one,ewcljrsq),_mm_mul_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half)));
2285 /* f6A = 6 * C6grid * (1 - poly) */
2286 f6A = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
2287 /* f6B = C6grid * exponent * beta^6 */
2288 f6B = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
2289 /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
2290 fvdw = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),_mm_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
2292 cutoff_mask = _mm_cmplt_ps(rsq00,rcutoff2);
2294 fscal = _mm_add_ps(felec,fvdw);
2296 fscal = _mm_and_ps(fscal,cutoff_mask);
2298 fscal = _mm_andnot_ps(dummy_mask,fscal);
2300 /* Calculate temporary vectorial force */
2301 tx = _mm_mul_ps(fscal,dx00);
2302 ty = _mm_mul_ps(fscal,dy00);
2303 tz = _mm_mul_ps(fscal,dz00);
2305 /* Update vectorial force */
2306 fix0 = _mm_add_ps(fix0,tx);
2307 fiy0 = _mm_add_ps(fiy0,ty);
2308 fiz0 = _mm_add_ps(fiz0,tz);
2310 fjx0 = _mm_add_ps(fjx0,tx);
2311 fjy0 = _mm_add_ps(fjy0,ty);
2312 fjz0 = _mm_add_ps(fjz0,tz);
2316 /**************************
2317 * CALCULATE INTERACTIONS *
2318 **************************/
2320 if (gmx_mm_any_lt(rsq01,rcutoff2))
2323 r01 = _mm_mul_ps(rsq01,rinv01);
2324 r01 = _mm_andnot_ps(dummy_mask,r01);
2326 /* EWALD ELECTROSTATICS */
2328 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2329 ewrt = _mm_mul_ps(r01,ewtabscale);
2330 ewitab = _mm_cvttps_epi32(ewrt);
2331 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2332 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2333 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2335 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2336 felec = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2338 cutoff_mask = _mm_cmplt_ps(rsq01,rcutoff2);
2342 fscal = _mm_and_ps(fscal,cutoff_mask);
2344 fscal = _mm_andnot_ps(dummy_mask,fscal);
2346 /* Calculate temporary vectorial force */
2347 tx = _mm_mul_ps(fscal,dx01);
2348 ty = _mm_mul_ps(fscal,dy01);
2349 tz = _mm_mul_ps(fscal,dz01);
2351 /* Update vectorial force */
2352 fix0 = _mm_add_ps(fix0,tx);
2353 fiy0 = _mm_add_ps(fiy0,ty);
2354 fiz0 = _mm_add_ps(fiz0,tz);
2356 fjx1 = _mm_add_ps(fjx1,tx);
2357 fjy1 = _mm_add_ps(fjy1,ty);
2358 fjz1 = _mm_add_ps(fjz1,tz);
2362 /**************************
2363 * CALCULATE INTERACTIONS *
2364 **************************/
2366 if (gmx_mm_any_lt(rsq02,rcutoff2))
2369 r02 = _mm_mul_ps(rsq02,rinv02);
2370 r02 = _mm_andnot_ps(dummy_mask,r02);
2372 /* EWALD ELECTROSTATICS */
2374 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2375 ewrt = _mm_mul_ps(r02,ewtabscale);
2376 ewitab = _mm_cvttps_epi32(ewrt);
2377 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2378 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2379 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2381 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2382 felec = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2384 cutoff_mask = _mm_cmplt_ps(rsq02,rcutoff2);
2388 fscal = _mm_and_ps(fscal,cutoff_mask);
2390 fscal = _mm_andnot_ps(dummy_mask,fscal);
2392 /* Calculate temporary vectorial force */
2393 tx = _mm_mul_ps(fscal,dx02);
2394 ty = _mm_mul_ps(fscal,dy02);
2395 tz = _mm_mul_ps(fscal,dz02);
2397 /* Update vectorial force */
2398 fix0 = _mm_add_ps(fix0,tx);
2399 fiy0 = _mm_add_ps(fiy0,ty);
2400 fiz0 = _mm_add_ps(fiz0,tz);
2402 fjx2 = _mm_add_ps(fjx2,tx);
2403 fjy2 = _mm_add_ps(fjy2,ty);
2404 fjz2 = _mm_add_ps(fjz2,tz);
2408 /**************************
2409 * CALCULATE INTERACTIONS *
2410 **************************/
2412 if (gmx_mm_any_lt(rsq10,rcutoff2))
2415 r10 = _mm_mul_ps(rsq10,rinv10);
2416 r10 = _mm_andnot_ps(dummy_mask,r10);
2418 /* EWALD ELECTROSTATICS */
2420 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2421 ewrt = _mm_mul_ps(r10,ewtabscale);
2422 ewitab = _mm_cvttps_epi32(ewrt);
2423 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2424 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2425 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2427 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2428 felec = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2430 cutoff_mask = _mm_cmplt_ps(rsq10,rcutoff2);
2434 fscal = _mm_and_ps(fscal,cutoff_mask);
2436 fscal = _mm_andnot_ps(dummy_mask,fscal);
2438 /* Calculate temporary vectorial force */
2439 tx = _mm_mul_ps(fscal,dx10);
2440 ty = _mm_mul_ps(fscal,dy10);
2441 tz = _mm_mul_ps(fscal,dz10);
2443 /* Update vectorial force */
2444 fix1 = _mm_add_ps(fix1,tx);
2445 fiy1 = _mm_add_ps(fiy1,ty);
2446 fiz1 = _mm_add_ps(fiz1,tz);
2448 fjx0 = _mm_add_ps(fjx0,tx);
2449 fjy0 = _mm_add_ps(fjy0,ty);
2450 fjz0 = _mm_add_ps(fjz0,tz);
2454 /**************************
2455 * CALCULATE INTERACTIONS *
2456 **************************/
2458 if (gmx_mm_any_lt(rsq11,rcutoff2))
2461 r11 = _mm_mul_ps(rsq11,rinv11);
2462 r11 = _mm_andnot_ps(dummy_mask,r11);
2464 /* EWALD ELECTROSTATICS */
2466 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2467 ewrt = _mm_mul_ps(r11,ewtabscale);
2468 ewitab = _mm_cvttps_epi32(ewrt);
2469 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2470 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2471 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2473 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2474 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2476 cutoff_mask = _mm_cmplt_ps(rsq11,rcutoff2);
2480 fscal = _mm_and_ps(fscal,cutoff_mask);
2482 fscal = _mm_andnot_ps(dummy_mask,fscal);
2484 /* Calculate temporary vectorial force */
2485 tx = _mm_mul_ps(fscal,dx11);
2486 ty = _mm_mul_ps(fscal,dy11);
2487 tz = _mm_mul_ps(fscal,dz11);
2489 /* Update vectorial force */
2490 fix1 = _mm_add_ps(fix1,tx);
2491 fiy1 = _mm_add_ps(fiy1,ty);
2492 fiz1 = _mm_add_ps(fiz1,tz);
2494 fjx1 = _mm_add_ps(fjx1,tx);
2495 fjy1 = _mm_add_ps(fjy1,ty);
2496 fjz1 = _mm_add_ps(fjz1,tz);
2500 /**************************
2501 * CALCULATE INTERACTIONS *
2502 **************************/
2504 if (gmx_mm_any_lt(rsq12,rcutoff2))
2507 r12 = _mm_mul_ps(rsq12,rinv12);
2508 r12 = _mm_andnot_ps(dummy_mask,r12);
2510 /* EWALD ELECTROSTATICS */
2512 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2513 ewrt = _mm_mul_ps(r12,ewtabscale);
2514 ewitab = _mm_cvttps_epi32(ewrt);
2515 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2516 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2517 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2519 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2520 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2522 cutoff_mask = _mm_cmplt_ps(rsq12,rcutoff2);
2526 fscal = _mm_and_ps(fscal,cutoff_mask);
2528 fscal = _mm_andnot_ps(dummy_mask,fscal);
2530 /* Calculate temporary vectorial force */
2531 tx = _mm_mul_ps(fscal,dx12);
2532 ty = _mm_mul_ps(fscal,dy12);
2533 tz = _mm_mul_ps(fscal,dz12);
2535 /* Update vectorial force */
2536 fix1 = _mm_add_ps(fix1,tx);
2537 fiy1 = _mm_add_ps(fiy1,ty);
2538 fiz1 = _mm_add_ps(fiz1,tz);
2540 fjx2 = _mm_add_ps(fjx2,tx);
2541 fjy2 = _mm_add_ps(fjy2,ty);
2542 fjz2 = _mm_add_ps(fjz2,tz);
2546 /**************************
2547 * CALCULATE INTERACTIONS *
2548 **************************/
2550 if (gmx_mm_any_lt(rsq20,rcutoff2))
2553 r20 = _mm_mul_ps(rsq20,rinv20);
2554 r20 = _mm_andnot_ps(dummy_mask,r20);
2556 /* EWALD ELECTROSTATICS */
2558 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2559 ewrt = _mm_mul_ps(r20,ewtabscale);
2560 ewitab = _mm_cvttps_epi32(ewrt);
2561 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2562 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2563 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2565 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2566 felec = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2568 cutoff_mask = _mm_cmplt_ps(rsq20,rcutoff2);
2572 fscal = _mm_and_ps(fscal,cutoff_mask);
2574 fscal = _mm_andnot_ps(dummy_mask,fscal);
2576 /* Calculate temporary vectorial force */
2577 tx = _mm_mul_ps(fscal,dx20);
2578 ty = _mm_mul_ps(fscal,dy20);
2579 tz = _mm_mul_ps(fscal,dz20);
2581 /* Update vectorial force */
2582 fix2 = _mm_add_ps(fix2,tx);
2583 fiy2 = _mm_add_ps(fiy2,ty);
2584 fiz2 = _mm_add_ps(fiz2,tz);
2586 fjx0 = _mm_add_ps(fjx0,tx);
2587 fjy0 = _mm_add_ps(fjy0,ty);
2588 fjz0 = _mm_add_ps(fjz0,tz);
2592 /**************************
2593 * CALCULATE INTERACTIONS *
2594 **************************/
2596 if (gmx_mm_any_lt(rsq21,rcutoff2))
2599 r21 = _mm_mul_ps(rsq21,rinv21);
2600 r21 = _mm_andnot_ps(dummy_mask,r21);
2602 /* EWALD ELECTROSTATICS */
2604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2605 ewrt = _mm_mul_ps(r21,ewtabscale);
2606 ewitab = _mm_cvttps_epi32(ewrt);
2607 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2608 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2609 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2611 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2612 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2614 cutoff_mask = _mm_cmplt_ps(rsq21,rcutoff2);
2618 fscal = _mm_and_ps(fscal,cutoff_mask);
2620 fscal = _mm_andnot_ps(dummy_mask,fscal);
2622 /* Calculate temporary vectorial force */
2623 tx = _mm_mul_ps(fscal,dx21);
2624 ty = _mm_mul_ps(fscal,dy21);
2625 tz = _mm_mul_ps(fscal,dz21);
2627 /* Update vectorial force */
2628 fix2 = _mm_add_ps(fix2,tx);
2629 fiy2 = _mm_add_ps(fiy2,ty);
2630 fiz2 = _mm_add_ps(fiz2,tz);
2632 fjx1 = _mm_add_ps(fjx1,tx);
2633 fjy1 = _mm_add_ps(fjy1,ty);
2634 fjz1 = _mm_add_ps(fjz1,tz);
2638 /**************************
2639 * CALCULATE INTERACTIONS *
2640 **************************/
2642 if (gmx_mm_any_lt(rsq22,rcutoff2))
2645 r22 = _mm_mul_ps(rsq22,rinv22);
2646 r22 = _mm_andnot_ps(dummy_mask,r22);
2648 /* EWALD ELECTROSTATICS */
2650 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2651 ewrt = _mm_mul_ps(r22,ewtabscale);
2652 ewitab = _mm_cvttps_epi32(ewrt);
2653 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2654 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2655 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2657 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2658 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2660 cutoff_mask = _mm_cmplt_ps(rsq22,rcutoff2);
2664 fscal = _mm_and_ps(fscal,cutoff_mask);
2666 fscal = _mm_andnot_ps(dummy_mask,fscal);
2668 /* Calculate temporary vectorial force */
2669 tx = _mm_mul_ps(fscal,dx22);
2670 ty = _mm_mul_ps(fscal,dy22);
2671 tz = _mm_mul_ps(fscal,dz22);
2673 /* Update vectorial force */
2674 fix2 = _mm_add_ps(fix2,tx);
2675 fiy2 = _mm_add_ps(fiy2,ty);
2676 fiz2 = _mm_add_ps(fiz2,tz);
2678 fjx2 = _mm_add_ps(fjx2,tx);
2679 fjy2 = _mm_add_ps(fjy2,ty);
2680 fjz2 = _mm_add_ps(fjz2,tz);
2684 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2685 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2686 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2687 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2689 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2690 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2692 /* Inner loop uses 383 flops */
2695 /* End of innermost loop */
2697 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2698 f+i_coord_offset,fshift+i_shift_offset);
2700 /* Increment number of inner iterations */
2701 inneriter += j_index_end - j_index_start;
2703 /* Outer loop uses 18 flops */
2706 /* Increment number of outer iterations */
2709 /* Update outer/inner flops */
2711 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*383);