2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: Ewald
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
105 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
107 __m128 dummy_mask,cutoff_mask;
108 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109 __m128 one = _mm_set1_ps(1.0);
110 __m128 two = _mm_set1_ps(2.0);
116 jindex = nlist->jindex;
118 shiftidx = nlist->shift;
120 shiftvec = fr->shift_vec[0];
121 fshift = fr->fshift[0];
122 facel = _mm_set1_ps(fr->epsfac);
123 charge = mdatoms->chargeA;
124 nvdwtype = fr->ntype;
126 vdwtype = mdatoms->typeA;
128 vftab = kernel_data->table_vdw->data;
129 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
131 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
132 ewtab = fr->ic->tabq_coul_FDV0;
133 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
134 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
138 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
139 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
140 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
141 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
143 jq1 = _mm_set1_ps(charge[inr+1]);
144 jq2 = _mm_set1_ps(charge[inr+2]);
145 jq3 = _mm_set1_ps(charge[inr+3]);
146 vdwjidx0A = 2*vdwtype[inr+0];
147 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
148 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
149 qq11 = _mm_mul_ps(iq1,jq1);
150 qq12 = _mm_mul_ps(iq1,jq2);
151 qq13 = _mm_mul_ps(iq1,jq3);
152 qq21 = _mm_mul_ps(iq2,jq1);
153 qq22 = _mm_mul_ps(iq2,jq2);
154 qq23 = _mm_mul_ps(iq2,jq3);
155 qq31 = _mm_mul_ps(iq3,jq1);
156 qq32 = _mm_mul_ps(iq3,jq2);
157 qq33 = _mm_mul_ps(iq3,jq3);
159 /* Avoid stupid compiler warnings */
160 jnrA = jnrB = jnrC = jnrD = 0;
169 /* Start outer loop over neighborlists */
170 for(iidx=0; iidx<nri; iidx++)
172 /* Load shift vector for this list */
173 i_shift_offset = DIM*shiftidx[iidx];
174 shX = shiftvec[i_shift_offset+XX];
175 shY = shiftvec[i_shift_offset+YY];
176 shZ = shiftvec[i_shift_offset+ZZ];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
188 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
189 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
190 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
191 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
192 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
193 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
194 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
195 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
196 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
197 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
198 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
200 fix0 = _mm_setzero_ps();
201 fiy0 = _mm_setzero_ps();
202 fiz0 = _mm_setzero_ps();
203 fix1 = _mm_setzero_ps();
204 fiy1 = _mm_setzero_ps();
205 fiz1 = _mm_setzero_ps();
206 fix2 = _mm_setzero_ps();
207 fiy2 = _mm_setzero_ps();
208 fiz2 = _mm_setzero_ps();
209 fix3 = _mm_setzero_ps();
210 fiy3 = _mm_setzero_ps();
211 fiz3 = _mm_setzero_ps();
213 /* Reset potential sums */
214 velecsum = _mm_setzero_ps();
215 vvdwsum = _mm_setzero_ps();
217 /* Start inner kernel loop */
218 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
221 /* Get j neighbor index, and coordinate index */
227 j_coord_offsetA = DIM*jnrA;
228 j_coord_offsetB = DIM*jnrB;
229 j_coord_offsetC = DIM*jnrC;
230 j_coord_offsetD = DIM*jnrD;
232 /* load j atom coordinates */
233 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234 x+j_coord_offsetC,x+j_coord_offsetD,
235 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
236 &jy2,&jz2,&jx3,&jy3,&jz3);
238 /* Calculate displacement vector */
239 dx00 = _mm_sub_ps(ix0,jx0);
240 dy00 = _mm_sub_ps(iy0,jy0);
241 dz00 = _mm_sub_ps(iz0,jz0);
242 dx11 = _mm_sub_ps(ix1,jx1);
243 dy11 = _mm_sub_ps(iy1,jy1);
244 dz11 = _mm_sub_ps(iz1,jz1);
245 dx12 = _mm_sub_ps(ix1,jx2);
246 dy12 = _mm_sub_ps(iy1,jy2);
247 dz12 = _mm_sub_ps(iz1,jz2);
248 dx13 = _mm_sub_ps(ix1,jx3);
249 dy13 = _mm_sub_ps(iy1,jy3);
250 dz13 = _mm_sub_ps(iz1,jz3);
251 dx21 = _mm_sub_ps(ix2,jx1);
252 dy21 = _mm_sub_ps(iy2,jy1);
253 dz21 = _mm_sub_ps(iz2,jz1);
254 dx22 = _mm_sub_ps(ix2,jx2);
255 dy22 = _mm_sub_ps(iy2,jy2);
256 dz22 = _mm_sub_ps(iz2,jz2);
257 dx23 = _mm_sub_ps(ix2,jx3);
258 dy23 = _mm_sub_ps(iy2,jy3);
259 dz23 = _mm_sub_ps(iz2,jz3);
260 dx31 = _mm_sub_ps(ix3,jx1);
261 dy31 = _mm_sub_ps(iy3,jy1);
262 dz31 = _mm_sub_ps(iz3,jz1);
263 dx32 = _mm_sub_ps(ix3,jx2);
264 dy32 = _mm_sub_ps(iy3,jy2);
265 dz32 = _mm_sub_ps(iz3,jz2);
266 dx33 = _mm_sub_ps(ix3,jx3);
267 dy33 = _mm_sub_ps(iy3,jy3);
268 dz33 = _mm_sub_ps(iz3,jz3);
270 /* Calculate squared distance and things based on it */
271 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
272 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
275 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
277 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
278 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
279 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
280 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
282 rinv00 = gmx_mm_invsqrt_ps(rsq00);
283 rinv11 = gmx_mm_invsqrt_ps(rsq11);
284 rinv12 = gmx_mm_invsqrt_ps(rsq12);
285 rinv13 = gmx_mm_invsqrt_ps(rsq13);
286 rinv21 = gmx_mm_invsqrt_ps(rsq21);
287 rinv22 = gmx_mm_invsqrt_ps(rsq22);
288 rinv23 = gmx_mm_invsqrt_ps(rsq23);
289 rinv31 = gmx_mm_invsqrt_ps(rsq31);
290 rinv32 = gmx_mm_invsqrt_ps(rsq32);
291 rinv33 = gmx_mm_invsqrt_ps(rsq33);
293 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
294 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
295 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
296 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
297 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
298 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
299 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
300 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
301 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
303 fjx0 = _mm_setzero_ps();
304 fjy0 = _mm_setzero_ps();
305 fjz0 = _mm_setzero_ps();
306 fjx1 = _mm_setzero_ps();
307 fjy1 = _mm_setzero_ps();
308 fjz1 = _mm_setzero_ps();
309 fjx2 = _mm_setzero_ps();
310 fjy2 = _mm_setzero_ps();
311 fjz2 = _mm_setzero_ps();
312 fjx3 = _mm_setzero_ps();
313 fjy3 = _mm_setzero_ps();
314 fjz3 = _mm_setzero_ps();
316 /**************************
317 * CALCULATE INTERACTIONS *
318 **************************/
320 r00 = _mm_mul_ps(rsq00,rinv00);
322 /* Calculate table index by multiplying r with table scale and truncate to integer */
323 rt = _mm_mul_ps(r00,vftabscale);
324 vfitab = _mm_cvttps_epi32(rt);
325 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
326 vfitab = _mm_slli_epi32(vfitab,3);
328 /* CUBIC SPLINE TABLE DISPERSION */
329 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
330 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
331 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
332 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
333 _MM_TRANSPOSE4_PS(Y,F,G,H);
334 Heps = _mm_mul_ps(vfeps,H);
335 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
336 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
337 vvdw6 = _mm_mul_ps(c6_00,VV);
338 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
339 fvdw6 = _mm_mul_ps(c6_00,FF);
341 /* CUBIC SPLINE TABLE REPULSION */
342 vfitab = _mm_add_epi32(vfitab,ifour);
343 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
344 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
345 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
346 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
347 _MM_TRANSPOSE4_PS(Y,F,G,H);
348 Heps = _mm_mul_ps(vfeps,H);
349 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
350 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
351 vvdw12 = _mm_mul_ps(c12_00,VV);
352 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
353 fvdw12 = _mm_mul_ps(c12_00,FF);
354 vvdw = _mm_add_ps(vvdw12,vvdw6);
355 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
362 /* Calculate temporary vectorial force */
363 tx = _mm_mul_ps(fscal,dx00);
364 ty = _mm_mul_ps(fscal,dy00);
365 tz = _mm_mul_ps(fscal,dz00);
367 /* Update vectorial force */
368 fix0 = _mm_add_ps(fix0,tx);
369 fiy0 = _mm_add_ps(fiy0,ty);
370 fiz0 = _mm_add_ps(fiz0,tz);
372 fjx0 = _mm_add_ps(fjx0,tx);
373 fjy0 = _mm_add_ps(fjy0,ty);
374 fjz0 = _mm_add_ps(fjz0,tz);
376 /**************************
377 * CALCULATE INTERACTIONS *
378 **************************/
380 r11 = _mm_mul_ps(rsq11,rinv11);
382 /* EWALD ELECTROSTATICS */
384 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
385 ewrt = _mm_mul_ps(r11,ewtabscale);
386 ewitab = _mm_cvttps_epi32(ewrt);
387 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
388 ewitab = _mm_slli_epi32(ewitab,2);
389 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
390 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
391 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
392 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
393 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
394 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
395 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
396 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
397 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_ps(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_ps(fscal,dx11);
406 ty = _mm_mul_ps(fscal,dy11);
407 tz = _mm_mul_ps(fscal,dz11);
409 /* Update vectorial force */
410 fix1 = _mm_add_ps(fix1,tx);
411 fiy1 = _mm_add_ps(fiy1,ty);
412 fiz1 = _mm_add_ps(fiz1,tz);
414 fjx1 = _mm_add_ps(fjx1,tx);
415 fjy1 = _mm_add_ps(fjy1,ty);
416 fjz1 = _mm_add_ps(fjz1,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r12 = _mm_mul_ps(rsq12,rinv12);
424 /* EWALD ELECTROSTATICS */
426 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
427 ewrt = _mm_mul_ps(r12,ewtabscale);
428 ewitab = _mm_cvttps_epi32(ewrt);
429 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
430 ewitab = _mm_slli_epi32(ewitab,2);
431 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
432 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
433 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
434 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
435 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
436 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
437 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
438 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
439 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm_add_ps(velecsum,velec);
446 /* Calculate temporary vectorial force */
447 tx = _mm_mul_ps(fscal,dx12);
448 ty = _mm_mul_ps(fscal,dy12);
449 tz = _mm_mul_ps(fscal,dz12);
451 /* Update vectorial force */
452 fix1 = _mm_add_ps(fix1,tx);
453 fiy1 = _mm_add_ps(fiy1,ty);
454 fiz1 = _mm_add_ps(fiz1,tz);
456 fjx2 = _mm_add_ps(fjx2,tx);
457 fjy2 = _mm_add_ps(fjy2,ty);
458 fjz2 = _mm_add_ps(fjz2,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r13 = _mm_mul_ps(rsq13,rinv13);
466 /* EWALD ELECTROSTATICS */
468 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
469 ewrt = _mm_mul_ps(r13,ewtabscale);
470 ewitab = _mm_cvttps_epi32(ewrt);
471 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
472 ewitab = _mm_slli_epi32(ewitab,2);
473 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
474 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
475 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
476 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
477 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
478 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
479 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
480 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
481 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
483 /* Update potential sum for this i atom from the interaction with this j atom. */
484 velecsum = _mm_add_ps(velecsum,velec);
488 /* Calculate temporary vectorial force */
489 tx = _mm_mul_ps(fscal,dx13);
490 ty = _mm_mul_ps(fscal,dy13);
491 tz = _mm_mul_ps(fscal,dz13);
493 /* Update vectorial force */
494 fix1 = _mm_add_ps(fix1,tx);
495 fiy1 = _mm_add_ps(fiy1,ty);
496 fiz1 = _mm_add_ps(fiz1,tz);
498 fjx3 = _mm_add_ps(fjx3,tx);
499 fjy3 = _mm_add_ps(fjy3,ty);
500 fjz3 = _mm_add_ps(fjz3,tz);
502 /**************************
503 * CALCULATE INTERACTIONS *
504 **************************/
506 r21 = _mm_mul_ps(rsq21,rinv21);
508 /* EWALD ELECTROSTATICS */
510 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
511 ewrt = _mm_mul_ps(r21,ewtabscale);
512 ewitab = _mm_cvttps_epi32(ewrt);
513 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
514 ewitab = _mm_slli_epi32(ewitab,2);
515 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
516 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
517 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
518 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
519 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
520 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
521 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
522 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
523 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm_add_ps(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm_mul_ps(fscal,dx21);
532 ty = _mm_mul_ps(fscal,dy21);
533 tz = _mm_mul_ps(fscal,dz21);
535 /* Update vectorial force */
536 fix2 = _mm_add_ps(fix2,tx);
537 fiy2 = _mm_add_ps(fiy2,ty);
538 fiz2 = _mm_add_ps(fiz2,tz);
540 fjx1 = _mm_add_ps(fjx1,tx);
541 fjy1 = _mm_add_ps(fjy1,ty);
542 fjz1 = _mm_add_ps(fjz1,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r22 = _mm_mul_ps(rsq22,rinv22);
550 /* EWALD ELECTROSTATICS */
552 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
553 ewrt = _mm_mul_ps(r22,ewtabscale);
554 ewitab = _mm_cvttps_epi32(ewrt);
555 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
556 ewitab = _mm_slli_epi32(ewitab,2);
557 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
558 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
559 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
560 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
561 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
562 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
563 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
564 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
565 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
567 /* Update potential sum for this i atom from the interaction with this j atom. */
568 velecsum = _mm_add_ps(velecsum,velec);
572 /* Calculate temporary vectorial force */
573 tx = _mm_mul_ps(fscal,dx22);
574 ty = _mm_mul_ps(fscal,dy22);
575 tz = _mm_mul_ps(fscal,dz22);
577 /* Update vectorial force */
578 fix2 = _mm_add_ps(fix2,tx);
579 fiy2 = _mm_add_ps(fiy2,ty);
580 fiz2 = _mm_add_ps(fiz2,tz);
582 fjx2 = _mm_add_ps(fjx2,tx);
583 fjy2 = _mm_add_ps(fjy2,ty);
584 fjz2 = _mm_add_ps(fjz2,tz);
586 /**************************
587 * CALCULATE INTERACTIONS *
588 **************************/
590 r23 = _mm_mul_ps(rsq23,rinv23);
592 /* EWALD ELECTROSTATICS */
594 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
595 ewrt = _mm_mul_ps(r23,ewtabscale);
596 ewitab = _mm_cvttps_epi32(ewrt);
597 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
598 ewitab = _mm_slli_epi32(ewitab,2);
599 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
600 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
601 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
602 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
603 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
604 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
605 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
606 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
607 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
609 /* Update potential sum for this i atom from the interaction with this j atom. */
610 velecsum = _mm_add_ps(velecsum,velec);
614 /* Calculate temporary vectorial force */
615 tx = _mm_mul_ps(fscal,dx23);
616 ty = _mm_mul_ps(fscal,dy23);
617 tz = _mm_mul_ps(fscal,dz23);
619 /* Update vectorial force */
620 fix2 = _mm_add_ps(fix2,tx);
621 fiy2 = _mm_add_ps(fiy2,ty);
622 fiz2 = _mm_add_ps(fiz2,tz);
624 fjx3 = _mm_add_ps(fjx3,tx);
625 fjy3 = _mm_add_ps(fjy3,ty);
626 fjz3 = _mm_add_ps(fjz3,tz);
628 /**************************
629 * CALCULATE INTERACTIONS *
630 **************************/
632 r31 = _mm_mul_ps(rsq31,rinv31);
634 /* EWALD ELECTROSTATICS */
636 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
637 ewrt = _mm_mul_ps(r31,ewtabscale);
638 ewitab = _mm_cvttps_epi32(ewrt);
639 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
640 ewitab = _mm_slli_epi32(ewitab,2);
641 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
642 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
643 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
644 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
645 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
646 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
647 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
648 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
649 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
651 /* Update potential sum for this i atom from the interaction with this j atom. */
652 velecsum = _mm_add_ps(velecsum,velec);
656 /* Calculate temporary vectorial force */
657 tx = _mm_mul_ps(fscal,dx31);
658 ty = _mm_mul_ps(fscal,dy31);
659 tz = _mm_mul_ps(fscal,dz31);
661 /* Update vectorial force */
662 fix3 = _mm_add_ps(fix3,tx);
663 fiy3 = _mm_add_ps(fiy3,ty);
664 fiz3 = _mm_add_ps(fiz3,tz);
666 fjx1 = _mm_add_ps(fjx1,tx);
667 fjy1 = _mm_add_ps(fjy1,ty);
668 fjz1 = _mm_add_ps(fjz1,tz);
670 /**************************
671 * CALCULATE INTERACTIONS *
672 **************************/
674 r32 = _mm_mul_ps(rsq32,rinv32);
676 /* EWALD ELECTROSTATICS */
678 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
679 ewrt = _mm_mul_ps(r32,ewtabscale);
680 ewitab = _mm_cvttps_epi32(ewrt);
681 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
682 ewitab = _mm_slli_epi32(ewitab,2);
683 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
684 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
685 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
686 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
687 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
688 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
689 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
690 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
691 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velecsum = _mm_add_ps(velecsum,velec);
698 /* Calculate temporary vectorial force */
699 tx = _mm_mul_ps(fscal,dx32);
700 ty = _mm_mul_ps(fscal,dy32);
701 tz = _mm_mul_ps(fscal,dz32);
703 /* Update vectorial force */
704 fix3 = _mm_add_ps(fix3,tx);
705 fiy3 = _mm_add_ps(fiy3,ty);
706 fiz3 = _mm_add_ps(fiz3,tz);
708 fjx2 = _mm_add_ps(fjx2,tx);
709 fjy2 = _mm_add_ps(fjy2,ty);
710 fjz2 = _mm_add_ps(fjz2,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 r33 = _mm_mul_ps(rsq33,rinv33);
718 /* EWALD ELECTROSTATICS */
720 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
721 ewrt = _mm_mul_ps(r33,ewtabscale);
722 ewitab = _mm_cvttps_epi32(ewrt);
723 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
724 ewitab = _mm_slli_epi32(ewitab,2);
725 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
726 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
727 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
728 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
729 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
730 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
731 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
732 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
733 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
735 /* Update potential sum for this i atom from the interaction with this j atom. */
736 velecsum = _mm_add_ps(velecsum,velec);
740 /* Calculate temporary vectorial force */
741 tx = _mm_mul_ps(fscal,dx33);
742 ty = _mm_mul_ps(fscal,dy33);
743 tz = _mm_mul_ps(fscal,dz33);
745 /* Update vectorial force */
746 fix3 = _mm_add_ps(fix3,tx);
747 fiy3 = _mm_add_ps(fiy3,ty);
748 fiz3 = _mm_add_ps(fiz3,tz);
750 fjx3 = _mm_add_ps(fjx3,tx);
751 fjy3 = _mm_add_ps(fjy3,ty);
752 fjz3 = _mm_add_ps(fjz3,tz);
754 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
755 f+j_coord_offsetC,f+j_coord_offsetD,
756 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
757 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
759 /* Inner loop uses 428 flops */
765 /* Get j neighbor index, and coordinate index */
771 /* Sign of each element will be negative for non-real atoms.
772 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
773 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
775 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
776 jnrA = (jnrA>=0) ? jnrA : 0;
777 jnrB = (jnrB>=0) ? jnrB : 0;
778 jnrC = (jnrC>=0) ? jnrC : 0;
779 jnrD = (jnrD>=0) ? jnrD : 0;
781 j_coord_offsetA = DIM*jnrA;
782 j_coord_offsetB = DIM*jnrB;
783 j_coord_offsetC = DIM*jnrC;
784 j_coord_offsetD = DIM*jnrD;
786 /* load j atom coordinates */
787 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
788 x+j_coord_offsetC,x+j_coord_offsetD,
789 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
790 &jy2,&jz2,&jx3,&jy3,&jz3);
792 /* Calculate displacement vector */
793 dx00 = _mm_sub_ps(ix0,jx0);
794 dy00 = _mm_sub_ps(iy0,jy0);
795 dz00 = _mm_sub_ps(iz0,jz0);
796 dx11 = _mm_sub_ps(ix1,jx1);
797 dy11 = _mm_sub_ps(iy1,jy1);
798 dz11 = _mm_sub_ps(iz1,jz1);
799 dx12 = _mm_sub_ps(ix1,jx2);
800 dy12 = _mm_sub_ps(iy1,jy2);
801 dz12 = _mm_sub_ps(iz1,jz2);
802 dx13 = _mm_sub_ps(ix1,jx3);
803 dy13 = _mm_sub_ps(iy1,jy3);
804 dz13 = _mm_sub_ps(iz1,jz3);
805 dx21 = _mm_sub_ps(ix2,jx1);
806 dy21 = _mm_sub_ps(iy2,jy1);
807 dz21 = _mm_sub_ps(iz2,jz1);
808 dx22 = _mm_sub_ps(ix2,jx2);
809 dy22 = _mm_sub_ps(iy2,jy2);
810 dz22 = _mm_sub_ps(iz2,jz2);
811 dx23 = _mm_sub_ps(ix2,jx3);
812 dy23 = _mm_sub_ps(iy2,jy3);
813 dz23 = _mm_sub_ps(iz2,jz3);
814 dx31 = _mm_sub_ps(ix3,jx1);
815 dy31 = _mm_sub_ps(iy3,jy1);
816 dz31 = _mm_sub_ps(iz3,jz1);
817 dx32 = _mm_sub_ps(ix3,jx2);
818 dy32 = _mm_sub_ps(iy3,jy2);
819 dz32 = _mm_sub_ps(iz3,jz2);
820 dx33 = _mm_sub_ps(ix3,jx3);
821 dy33 = _mm_sub_ps(iy3,jy3);
822 dz33 = _mm_sub_ps(iz3,jz3);
824 /* Calculate squared distance and things based on it */
825 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
826 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
827 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
828 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
829 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
830 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
831 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
832 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
833 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
834 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
836 rinv00 = gmx_mm_invsqrt_ps(rsq00);
837 rinv11 = gmx_mm_invsqrt_ps(rsq11);
838 rinv12 = gmx_mm_invsqrt_ps(rsq12);
839 rinv13 = gmx_mm_invsqrt_ps(rsq13);
840 rinv21 = gmx_mm_invsqrt_ps(rsq21);
841 rinv22 = gmx_mm_invsqrt_ps(rsq22);
842 rinv23 = gmx_mm_invsqrt_ps(rsq23);
843 rinv31 = gmx_mm_invsqrt_ps(rsq31);
844 rinv32 = gmx_mm_invsqrt_ps(rsq32);
845 rinv33 = gmx_mm_invsqrt_ps(rsq33);
847 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
848 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
849 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
850 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
851 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
852 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
853 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
854 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
855 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
857 fjx0 = _mm_setzero_ps();
858 fjy0 = _mm_setzero_ps();
859 fjz0 = _mm_setzero_ps();
860 fjx1 = _mm_setzero_ps();
861 fjy1 = _mm_setzero_ps();
862 fjz1 = _mm_setzero_ps();
863 fjx2 = _mm_setzero_ps();
864 fjy2 = _mm_setzero_ps();
865 fjz2 = _mm_setzero_ps();
866 fjx3 = _mm_setzero_ps();
867 fjy3 = _mm_setzero_ps();
868 fjz3 = _mm_setzero_ps();
870 /**************************
871 * CALCULATE INTERACTIONS *
872 **************************/
874 r00 = _mm_mul_ps(rsq00,rinv00);
875 r00 = _mm_andnot_ps(dummy_mask,r00);
877 /* Calculate table index by multiplying r with table scale and truncate to integer */
878 rt = _mm_mul_ps(r00,vftabscale);
879 vfitab = _mm_cvttps_epi32(rt);
880 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
881 vfitab = _mm_slli_epi32(vfitab,3);
883 /* CUBIC SPLINE TABLE DISPERSION */
884 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
885 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
886 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
887 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
888 _MM_TRANSPOSE4_PS(Y,F,G,H);
889 Heps = _mm_mul_ps(vfeps,H);
890 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
891 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
892 vvdw6 = _mm_mul_ps(c6_00,VV);
893 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
894 fvdw6 = _mm_mul_ps(c6_00,FF);
896 /* CUBIC SPLINE TABLE REPULSION */
897 vfitab = _mm_add_epi32(vfitab,ifour);
898 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
899 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
900 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
901 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
902 _MM_TRANSPOSE4_PS(Y,F,G,H);
903 Heps = _mm_mul_ps(vfeps,H);
904 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
905 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
906 vvdw12 = _mm_mul_ps(c12_00,VV);
907 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
908 fvdw12 = _mm_mul_ps(c12_00,FF);
909 vvdw = _mm_add_ps(vvdw12,vvdw6);
910 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
912 /* Update potential sum for this i atom from the interaction with this j atom. */
913 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
914 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
918 fscal = _mm_andnot_ps(dummy_mask,fscal);
920 /* Calculate temporary vectorial force */
921 tx = _mm_mul_ps(fscal,dx00);
922 ty = _mm_mul_ps(fscal,dy00);
923 tz = _mm_mul_ps(fscal,dz00);
925 /* Update vectorial force */
926 fix0 = _mm_add_ps(fix0,tx);
927 fiy0 = _mm_add_ps(fiy0,ty);
928 fiz0 = _mm_add_ps(fiz0,tz);
930 fjx0 = _mm_add_ps(fjx0,tx);
931 fjy0 = _mm_add_ps(fjy0,ty);
932 fjz0 = _mm_add_ps(fjz0,tz);
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r11 = _mm_mul_ps(rsq11,rinv11);
939 r11 = _mm_andnot_ps(dummy_mask,r11);
941 /* EWALD ELECTROSTATICS */
943 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
944 ewrt = _mm_mul_ps(r11,ewtabscale);
945 ewitab = _mm_cvttps_epi32(ewrt);
946 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
947 ewitab = _mm_slli_epi32(ewitab,2);
948 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
949 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
950 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
951 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
952 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
953 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
954 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
955 velec = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
956 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
958 /* Update potential sum for this i atom from the interaction with this j atom. */
959 velec = _mm_andnot_ps(dummy_mask,velec);
960 velecsum = _mm_add_ps(velecsum,velec);
964 fscal = _mm_andnot_ps(dummy_mask,fscal);
966 /* Calculate temporary vectorial force */
967 tx = _mm_mul_ps(fscal,dx11);
968 ty = _mm_mul_ps(fscal,dy11);
969 tz = _mm_mul_ps(fscal,dz11);
971 /* Update vectorial force */
972 fix1 = _mm_add_ps(fix1,tx);
973 fiy1 = _mm_add_ps(fiy1,ty);
974 fiz1 = _mm_add_ps(fiz1,tz);
976 fjx1 = _mm_add_ps(fjx1,tx);
977 fjy1 = _mm_add_ps(fjy1,ty);
978 fjz1 = _mm_add_ps(fjz1,tz);
980 /**************************
981 * CALCULATE INTERACTIONS *
982 **************************/
984 r12 = _mm_mul_ps(rsq12,rinv12);
985 r12 = _mm_andnot_ps(dummy_mask,r12);
987 /* EWALD ELECTROSTATICS */
989 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
990 ewrt = _mm_mul_ps(r12,ewtabscale);
991 ewitab = _mm_cvttps_epi32(ewrt);
992 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
993 ewitab = _mm_slli_epi32(ewitab,2);
994 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
995 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
996 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
997 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
998 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
999 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1000 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1001 velec = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1002 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1004 /* Update potential sum for this i atom from the interaction with this j atom. */
1005 velec = _mm_andnot_ps(dummy_mask,velec);
1006 velecsum = _mm_add_ps(velecsum,velec);
1010 fscal = _mm_andnot_ps(dummy_mask,fscal);
1012 /* Calculate temporary vectorial force */
1013 tx = _mm_mul_ps(fscal,dx12);
1014 ty = _mm_mul_ps(fscal,dy12);
1015 tz = _mm_mul_ps(fscal,dz12);
1017 /* Update vectorial force */
1018 fix1 = _mm_add_ps(fix1,tx);
1019 fiy1 = _mm_add_ps(fiy1,ty);
1020 fiz1 = _mm_add_ps(fiz1,tz);
1022 fjx2 = _mm_add_ps(fjx2,tx);
1023 fjy2 = _mm_add_ps(fjy2,ty);
1024 fjz2 = _mm_add_ps(fjz2,tz);
1026 /**************************
1027 * CALCULATE INTERACTIONS *
1028 **************************/
1030 r13 = _mm_mul_ps(rsq13,rinv13);
1031 r13 = _mm_andnot_ps(dummy_mask,r13);
1033 /* EWALD ELECTROSTATICS */
1035 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1036 ewrt = _mm_mul_ps(r13,ewtabscale);
1037 ewitab = _mm_cvttps_epi32(ewrt);
1038 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1039 ewitab = _mm_slli_epi32(ewitab,2);
1040 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1041 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1042 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1043 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1044 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1045 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1046 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1047 velec = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1048 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1050 /* Update potential sum for this i atom from the interaction with this j atom. */
1051 velec = _mm_andnot_ps(dummy_mask,velec);
1052 velecsum = _mm_add_ps(velecsum,velec);
1056 fscal = _mm_andnot_ps(dummy_mask,fscal);
1058 /* Calculate temporary vectorial force */
1059 tx = _mm_mul_ps(fscal,dx13);
1060 ty = _mm_mul_ps(fscal,dy13);
1061 tz = _mm_mul_ps(fscal,dz13);
1063 /* Update vectorial force */
1064 fix1 = _mm_add_ps(fix1,tx);
1065 fiy1 = _mm_add_ps(fiy1,ty);
1066 fiz1 = _mm_add_ps(fiz1,tz);
1068 fjx3 = _mm_add_ps(fjx3,tx);
1069 fjy3 = _mm_add_ps(fjy3,ty);
1070 fjz3 = _mm_add_ps(fjz3,tz);
1072 /**************************
1073 * CALCULATE INTERACTIONS *
1074 **************************/
1076 r21 = _mm_mul_ps(rsq21,rinv21);
1077 r21 = _mm_andnot_ps(dummy_mask,r21);
1079 /* EWALD ELECTROSTATICS */
1081 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1082 ewrt = _mm_mul_ps(r21,ewtabscale);
1083 ewitab = _mm_cvttps_epi32(ewrt);
1084 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1085 ewitab = _mm_slli_epi32(ewitab,2);
1086 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1087 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1088 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1089 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1090 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1091 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1092 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1093 velec = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1094 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1096 /* Update potential sum for this i atom from the interaction with this j atom. */
1097 velec = _mm_andnot_ps(dummy_mask,velec);
1098 velecsum = _mm_add_ps(velecsum,velec);
1102 fscal = _mm_andnot_ps(dummy_mask,fscal);
1104 /* Calculate temporary vectorial force */
1105 tx = _mm_mul_ps(fscal,dx21);
1106 ty = _mm_mul_ps(fscal,dy21);
1107 tz = _mm_mul_ps(fscal,dz21);
1109 /* Update vectorial force */
1110 fix2 = _mm_add_ps(fix2,tx);
1111 fiy2 = _mm_add_ps(fiy2,ty);
1112 fiz2 = _mm_add_ps(fiz2,tz);
1114 fjx1 = _mm_add_ps(fjx1,tx);
1115 fjy1 = _mm_add_ps(fjy1,ty);
1116 fjz1 = _mm_add_ps(fjz1,tz);
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 r22 = _mm_mul_ps(rsq22,rinv22);
1123 r22 = _mm_andnot_ps(dummy_mask,r22);
1125 /* EWALD ELECTROSTATICS */
1127 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1128 ewrt = _mm_mul_ps(r22,ewtabscale);
1129 ewitab = _mm_cvttps_epi32(ewrt);
1130 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1131 ewitab = _mm_slli_epi32(ewitab,2);
1132 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1133 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1134 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1135 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1136 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1137 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1138 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1139 velec = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1140 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1142 /* Update potential sum for this i atom from the interaction with this j atom. */
1143 velec = _mm_andnot_ps(dummy_mask,velec);
1144 velecsum = _mm_add_ps(velecsum,velec);
1148 fscal = _mm_andnot_ps(dummy_mask,fscal);
1150 /* Calculate temporary vectorial force */
1151 tx = _mm_mul_ps(fscal,dx22);
1152 ty = _mm_mul_ps(fscal,dy22);
1153 tz = _mm_mul_ps(fscal,dz22);
1155 /* Update vectorial force */
1156 fix2 = _mm_add_ps(fix2,tx);
1157 fiy2 = _mm_add_ps(fiy2,ty);
1158 fiz2 = _mm_add_ps(fiz2,tz);
1160 fjx2 = _mm_add_ps(fjx2,tx);
1161 fjy2 = _mm_add_ps(fjy2,ty);
1162 fjz2 = _mm_add_ps(fjz2,tz);
1164 /**************************
1165 * CALCULATE INTERACTIONS *
1166 **************************/
1168 r23 = _mm_mul_ps(rsq23,rinv23);
1169 r23 = _mm_andnot_ps(dummy_mask,r23);
1171 /* EWALD ELECTROSTATICS */
1173 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1174 ewrt = _mm_mul_ps(r23,ewtabscale);
1175 ewitab = _mm_cvttps_epi32(ewrt);
1176 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1177 ewitab = _mm_slli_epi32(ewitab,2);
1178 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1179 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1180 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1181 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1182 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1183 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1184 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1185 velec = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1186 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1188 /* Update potential sum for this i atom from the interaction with this j atom. */
1189 velec = _mm_andnot_ps(dummy_mask,velec);
1190 velecsum = _mm_add_ps(velecsum,velec);
1194 fscal = _mm_andnot_ps(dummy_mask,fscal);
1196 /* Calculate temporary vectorial force */
1197 tx = _mm_mul_ps(fscal,dx23);
1198 ty = _mm_mul_ps(fscal,dy23);
1199 tz = _mm_mul_ps(fscal,dz23);
1201 /* Update vectorial force */
1202 fix2 = _mm_add_ps(fix2,tx);
1203 fiy2 = _mm_add_ps(fiy2,ty);
1204 fiz2 = _mm_add_ps(fiz2,tz);
1206 fjx3 = _mm_add_ps(fjx3,tx);
1207 fjy3 = _mm_add_ps(fjy3,ty);
1208 fjz3 = _mm_add_ps(fjz3,tz);
1210 /**************************
1211 * CALCULATE INTERACTIONS *
1212 **************************/
1214 r31 = _mm_mul_ps(rsq31,rinv31);
1215 r31 = _mm_andnot_ps(dummy_mask,r31);
1217 /* EWALD ELECTROSTATICS */
1219 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1220 ewrt = _mm_mul_ps(r31,ewtabscale);
1221 ewitab = _mm_cvttps_epi32(ewrt);
1222 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1223 ewitab = _mm_slli_epi32(ewitab,2);
1224 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1225 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1226 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1227 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1228 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1229 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1230 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1231 velec = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1232 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1234 /* Update potential sum for this i atom from the interaction with this j atom. */
1235 velec = _mm_andnot_ps(dummy_mask,velec);
1236 velecsum = _mm_add_ps(velecsum,velec);
1240 fscal = _mm_andnot_ps(dummy_mask,fscal);
1242 /* Calculate temporary vectorial force */
1243 tx = _mm_mul_ps(fscal,dx31);
1244 ty = _mm_mul_ps(fscal,dy31);
1245 tz = _mm_mul_ps(fscal,dz31);
1247 /* Update vectorial force */
1248 fix3 = _mm_add_ps(fix3,tx);
1249 fiy3 = _mm_add_ps(fiy3,ty);
1250 fiz3 = _mm_add_ps(fiz3,tz);
1252 fjx1 = _mm_add_ps(fjx1,tx);
1253 fjy1 = _mm_add_ps(fjy1,ty);
1254 fjz1 = _mm_add_ps(fjz1,tz);
1256 /**************************
1257 * CALCULATE INTERACTIONS *
1258 **************************/
1260 r32 = _mm_mul_ps(rsq32,rinv32);
1261 r32 = _mm_andnot_ps(dummy_mask,r32);
1263 /* EWALD ELECTROSTATICS */
1265 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1266 ewrt = _mm_mul_ps(r32,ewtabscale);
1267 ewitab = _mm_cvttps_epi32(ewrt);
1268 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1269 ewitab = _mm_slli_epi32(ewitab,2);
1270 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1271 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1272 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1273 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1274 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1275 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1276 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1277 velec = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1278 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1280 /* Update potential sum for this i atom from the interaction with this j atom. */
1281 velec = _mm_andnot_ps(dummy_mask,velec);
1282 velecsum = _mm_add_ps(velecsum,velec);
1286 fscal = _mm_andnot_ps(dummy_mask,fscal);
1288 /* Calculate temporary vectorial force */
1289 tx = _mm_mul_ps(fscal,dx32);
1290 ty = _mm_mul_ps(fscal,dy32);
1291 tz = _mm_mul_ps(fscal,dz32);
1293 /* Update vectorial force */
1294 fix3 = _mm_add_ps(fix3,tx);
1295 fiy3 = _mm_add_ps(fiy3,ty);
1296 fiz3 = _mm_add_ps(fiz3,tz);
1298 fjx2 = _mm_add_ps(fjx2,tx);
1299 fjy2 = _mm_add_ps(fjy2,ty);
1300 fjz2 = _mm_add_ps(fjz2,tz);
1302 /**************************
1303 * CALCULATE INTERACTIONS *
1304 **************************/
1306 r33 = _mm_mul_ps(rsq33,rinv33);
1307 r33 = _mm_andnot_ps(dummy_mask,r33);
1309 /* EWALD ELECTROSTATICS */
1311 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1312 ewrt = _mm_mul_ps(r33,ewtabscale);
1313 ewitab = _mm_cvttps_epi32(ewrt);
1314 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1315 ewitab = _mm_slli_epi32(ewitab,2);
1316 ewtabF = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1317 ewtabD = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1318 ewtabV = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1319 ewtabFn = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1320 _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1321 felec = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1322 velec = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1323 velec = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1324 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1326 /* Update potential sum for this i atom from the interaction with this j atom. */
1327 velec = _mm_andnot_ps(dummy_mask,velec);
1328 velecsum = _mm_add_ps(velecsum,velec);
1332 fscal = _mm_andnot_ps(dummy_mask,fscal);
1334 /* Calculate temporary vectorial force */
1335 tx = _mm_mul_ps(fscal,dx33);
1336 ty = _mm_mul_ps(fscal,dy33);
1337 tz = _mm_mul_ps(fscal,dz33);
1339 /* Update vectorial force */
1340 fix3 = _mm_add_ps(fix3,tx);
1341 fiy3 = _mm_add_ps(fiy3,ty);
1342 fiz3 = _mm_add_ps(fiz3,tz);
1344 fjx3 = _mm_add_ps(fjx3,tx);
1345 fjy3 = _mm_add_ps(fjy3,ty);
1346 fjz3 = _mm_add_ps(fjz3,tz);
1348 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1349 f+j_coord_offsetC,f+j_coord_offsetD,
1350 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1351 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1353 /* Inner loop uses 438 flops */
1356 /* End of innermost loop */
1358 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1359 f+i_coord_offset,fshift+i_shift_offset);
1362 /* Update potential energies */
1363 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1364 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1366 /* Increment number of inner iterations */
1367 inneriter += j_index_end - j_index_start;
1369 /* Outer loop uses 38 flops */
1372 /* Increment number of outer iterations */
1375 /* Update outer/inner flops */
1377 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*438);
1380 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single
1381 * Electrostatics interaction: Ewald
1382 * VdW interaction: CubicSplineTable
1383 * Geometry: Water4-Water4
1384 * Calculate force/pot: Force
1387 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_sse2_single
1388 (t_nblist * gmx_restrict nlist,
1389 rvec * gmx_restrict xx,
1390 rvec * gmx_restrict ff,
1391 t_forcerec * gmx_restrict fr,
1392 t_mdatoms * gmx_restrict mdatoms,
1393 nb_kernel_data_t * gmx_restrict kernel_data,
1394 t_nrnb * gmx_restrict nrnb)
1396 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1397 * just 0 for non-waters.
1398 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1399 * jnr indices corresponding to data put in the four positions in the SIMD register.
1401 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1402 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1403 int jnrA,jnrB,jnrC,jnrD;
1404 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1405 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1406 real shX,shY,shZ,rcutoff_scalar;
1407 real *shiftvec,*fshift,*x,*f;
1408 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1410 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1412 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1414 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1416 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1417 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1418 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1419 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1420 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1421 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1422 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1423 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1424 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1425 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1426 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1427 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1428 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1429 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1430 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1431 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1432 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1433 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1434 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1435 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1438 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1441 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1442 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1444 __m128i ifour = _mm_set1_epi32(4);
1445 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1448 __m128 ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1450 __m128 dummy_mask,cutoff_mask;
1451 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1452 __m128 one = _mm_set1_ps(1.0);
1453 __m128 two = _mm_set1_ps(2.0);
1459 jindex = nlist->jindex;
1461 shiftidx = nlist->shift;
1463 shiftvec = fr->shift_vec[0];
1464 fshift = fr->fshift[0];
1465 facel = _mm_set1_ps(fr->epsfac);
1466 charge = mdatoms->chargeA;
1467 nvdwtype = fr->ntype;
1468 vdwparam = fr->nbfp;
1469 vdwtype = mdatoms->typeA;
1471 vftab = kernel_data->table_vdw->data;
1472 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1474 sh_ewald = _mm_set1_ps(fr->ic->sh_ewald);
1475 ewtab = fr->ic->tabq_coul_F;
1476 ewtabscale = _mm_set1_ps(fr->ic->tabq_scale);
1477 ewtabhalfspace = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1479 /* Setup water-specific parameters */
1480 inr = nlist->iinr[0];
1481 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1482 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1483 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1484 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1486 jq1 = _mm_set1_ps(charge[inr+1]);
1487 jq2 = _mm_set1_ps(charge[inr+2]);
1488 jq3 = _mm_set1_ps(charge[inr+3]);
1489 vdwjidx0A = 2*vdwtype[inr+0];
1490 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1491 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1492 qq11 = _mm_mul_ps(iq1,jq1);
1493 qq12 = _mm_mul_ps(iq1,jq2);
1494 qq13 = _mm_mul_ps(iq1,jq3);
1495 qq21 = _mm_mul_ps(iq2,jq1);
1496 qq22 = _mm_mul_ps(iq2,jq2);
1497 qq23 = _mm_mul_ps(iq2,jq3);
1498 qq31 = _mm_mul_ps(iq3,jq1);
1499 qq32 = _mm_mul_ps(iq3,jq2);
1500 qq33 = _mm_mul_ps(iq3,jq3);
1502 /* Avoid stupid compiler warnings */
1503 jnrA = jnrB = jnrC = jnrD = 0;
1504 j_coord_offsetA = 0;
1505 j_coord_offsetB = 0;
1506 j_coord_offsetC = 0;
1507 j_coord_offsetD = 0;
1512 /* Start outer loop over neighborlists */
1513 for(iidx=0; iidx<nri; iidx++)
1515 /* Load shift vector for this list */
1516 i_shift_offset = DIM*shiftidx[iidx];
1517 shX = shiftvec[i_shift_offset+XX];
1518 shY = shiftvec[i_shift_offset+YY];
1519 shZ = shiftvec[i_shift_offset+ZZ];
1521 /* Load limits for loop over neighbors */
1522 j_index_start = jindex[iidx];
1523 j_index_end = jindex[iidx+1];
1525 /* Get outer coordinate index */
1527 i_coord_offset = DIM*inr;
1529 /* Load i particle coords and add shift vector */
1530 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1531 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1532 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1533 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1534 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1535 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1536 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1537 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1538 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1539 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1540 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1541 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1543 fix0 = _mm_setzero_ps();
1544 fiy0 = _mm_setzero_ps();
1545 fiz0 = _mm_setzero_ps();
1546 fix1 = _mm_setzero_ps();
1547 fiy1 = _mm_setzero_ps();
1548 fiz1 = _mm_setzero_ps();
1549 fix2 = _mm_setzero_ps();
1550 fiy2 = _mm_setzero_ps();
1551 fiz2 = _mm_setzero_ps();
1552 fix3 = _mm_setzero_ps();
1553 fiy3 = _mm_setzero_ps();
1554 fiz3 = _mm_setzero_ps();
1556 /* Start inner kernel loop */
1557 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1560 /* Get j neighbor index, and coordinate index */
1562 jnrB = jjnr[jidx+1];
1563 jnrC = jjnr[jidx+2];
1564 jnrD = jjnr[jidx+3];
1566 j_coord_offsetA = DIM*jnrA;
1567 j_coord_offsetB = DIM*jnrB;
1568 j_coord_offsetC = DIM*jnrC;
1569 j_coord_offsetD = DIM*jnrD;
1571 /* load j atom coordinates */
1572 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1573 x+j_coord_offsetC,x+j_coord_offsetD,
1574 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1575 &jy2,&jz2,&jx3,&jy3,&jz3);
1577 /* Calculate displacement vector */
1578 dx00 = _mm_sub_ps(ix0,jx0);
1579 dy00 = _mm_sub_ps(iy0,jy0);
1580 dz00 = _mm_sub_ps(iz0,jz0);
1581 dx11 = _mm_sub_ps(ix1,jx1);
1582 dy11 = _mm_sub_ps(iy1,jy1);
1583 dz11 = _mm_sub_ps(iz1,jz1);
1584 dx12 = _mm_sub_ps(ix1,jx2);
1585 dy12 = _mm_sub_ps(iy1,jy2);
1586 dz12 = _mm_sub_ps(iz1,jz2);
1587 dx13 = _mm_sub_ps(ix1,jx3);
1588 dy13 = _mm_sub_ps(iy1,jy3);
1589 dz13 = _mm_sub_ps(iz1,jz3);
1590 dx21 = _mm_sub_ps(ix2,jx1);
1591 dy21 = _mm_sub_ps(iy2,jy1);
1592 dz21 = _mm_sub_ps(iz2,jz1);
1593 dx22 = _mm_sub_ps(ix2,jx2);
1594 dy22 = _mm_sub_ps(iy2,jy2);
1595 dz22 = _mm_sub_ps(iz2,jz2);
1596 dx23 = _mm_sub_ps(ix2,jx3);
1597 dy23 = _mm_sub_ps(iy2,jy3);
1598 dz23 = _mm_sub_ps(iz2,jz3);
1599 dx31 = _mm_sub_ps(ix3,jx1);
1600 dy31 = _mm_sub_ps(iy3,jy1);
1601 dz31 = _mm_sub_ps(iz3,jz1);
1602 dx32 = _mm_sub_ps(ix3,jx2);
1603 dy32 = _mm_sub_ps(iy3,jy2);
1604 dz32 = _mm_sub_ps(iz3,jz2);
1605 dx33 = _mm_sub_ps(ix3,jx3);
1606 dy33 = _mm_sub_ps(iy3,jy3);
1607 dz33 = _mm_sub_ps(iz3,jz3);
1609 /* Calculate squared distance and things based on it */
1610 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1611 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1612 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1613 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1614 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1615 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1616 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1617 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1618 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1619 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1621 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1622 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1623 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1624 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1625 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1626 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1627 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1628 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1629 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1630 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1632 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1633 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1634 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1635 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1636 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1637 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1638 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1639 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1640 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1642 fjx0 = _mm_setzero_ps();
1643 fjy0 = _mm_setzero_ps();
1644 fjz0 = _mm_setzero_ps();
1645 fjx1 = _mm_setzero_ps();
1646 fjy1 = _mm_setzero_ps();
1647 fjz1 = _mm_setzero_ps();
1648 fjx2 = _mm_setzero_ps();
1649 fjy2 = _mm_setzero_ps();
1650 fjz2 = _mm_setzero_ps();
1651 fjx3 = _mm_setzero_ps();
1652 fjy3 = _mm_setzero_ps();
1653 fjz3 = _mm_setzero_ps();
1655 /**************************
1656 * CALCULATE INTERACTIONS *
1657 **************************/
1659 r00 = _mm_mul_ps(rsq00,rinv00);
1661 /* Calculate table index by multiplying r with table scale and truncate to integer */
1662 rt = _mm_mul_ps(r00,vftabscale);
1663 vfitab = _mm_cvttps_epi32(rt);
1664 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1665 vfitab = _mm_slli_epi32(vfitab,3);
1667 /* CUBIC SPLINE TABLE DISPERSION */
1668 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1669 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1670 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1671 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1672 _MM_TRANSPOSE4_PS(Y,F,G,H);
1673 Heps = _mm_mul_ps(vfeps,H);
1674 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1675 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1676 fvdw6 = _mm_mul_ps(c6_00,FF);
1678 /* CUBIC SPLINE TABLE REPULSION */
1679 vfitab = _mm_add_epi32(vfitab,ifour);
1680 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1681 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1682 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1683 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1684 _MM_TRANSPOSE4_PS(Y,F,G,H);
1685 Heps = _mm_mul_ps(vfeps,H);
1686 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1687 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1688 fvdw12 = _mm_mul_ps(c12_00,FF);
1689 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1693 /* Calculate temporary vectorial force */
1694 tx = _mm_mul_ps(fscal,dx00);
1695 ty = _mm_mul_ps(fscal,dy00);
1696 tz = _mm_mul_ps(fscal,dz00);
1698 /* Update vectorial force */
1699 fix0 = _mm_add_ps(fix0,tx);
1700 fiy0 = _mm_add_ps(fiy0,ty);
1701 fiz0 = _mm_add_ps(fiz0,tz);
1703 fjx0 = _mm_add_ps(fjx0,tx);
1704 fjy0 = _mm_add_ps(fjy0,ty);
1705 fjz0 = _mm_add_ps(fjz0,tz);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 r11 = _mm_mul_ps(rsq11,rinv11);
1713 /* EWALD ELECTROSTATICS */
1715 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1716 ewrt = _mm_mul_ps(r11,ewtabscale);
1717 ewitab = _mm_cvttps_epi32(ewrt);
1718 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1719 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1720 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1722 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1723 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1727 /* Calculate temporary vectorial force */
1728 tx = _mm_mul_ps(fscal,dx11);
1729 ty = _mm_mul_ps(fscal,dy11);
1730 tz = _mm_mul_ps(fscal,dz11);
1732 /* Update vectorial force */
1733 fix1 = _mm_add_ps(fix1,tx);
1734 fiy1 = _mm_add_ps(fiy1,ty);
1735 fiz1 = _mm_add_ps(fiz1,tz);
1737 fjx1 = _mm_add_ps(fjx1,tx);
1738 fjy1 = _mm_add_ps(fjy1,ty);
1739 fjz1 = _mm_add_ps(fjz1,tz);
1741 /**************************
1742 * CALCULATE INTERACTIONS *
1743 **************************/
1745 r12 = _mm_mul_ps(rsq12,rinv12);
1747 /* EWALD ELECTROSTATICS */
1749 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1750 ewrt = _mm_mul_ps(r12,ewtabscale);
1751 ewitab = _mm_cvttps_epi32(ewrt);
1752 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1753 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1754 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1756 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1757 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1761 /* Calculate temporary vectorial force */
1762 tx = _mm_mul_ps(fscal,dx12);
1763 ty = _mm_mul_ps(fscal,dy12);
1764 tz = _mm_mul_ps(fscal,dz12);
1766 /* Update vectorial force */
1767 fix1 = _mm_add_ps(fix1,tx);
1768 fiy1 = _mm_add_ps(fiy1,ty);
1769 fiz1 = _mm_add_ps(fiz1,tz);
1771 fjx2 = _mm_add_ps(fjx2,tx);
1772 fjy2 = _mm_add_ps(fjy2,ty);
1773 fjz2 = _mm_add_ps(fjz2,tz);
1775 /**************************
1776 * CALCULATE INTERACTIONS *
1777 **************************/
1779 r13 = _mm_mul_ps(rsq13,rinv13);
1781 /* EWALD ELECTROSTATICS */
1783 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1784 ewrt = _mm_mul_ps(r13,ewtabscale);
1785 ewitab = _mm_cvttps_epi32(ewrt);
1786 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1787 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1788 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1790 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1791 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1795 /* Calculate temporary vectorial force */
1796 tx = _mm_mul_ps(fscal,dx13);
1797 ty = _mm_mul_ps(fscal,dy13);
1798 tz = _mm_mul_ps(fscal,dz13);
1800 /* Update vectorial force */
1801 fix1 = _mm_add_ps(fix1,tx);
1802 fiy1 = _mm_add_ps(fiy1,ty);
1803 fiz1 = _mm_add_ps(fiz1,tz);
1805 fjx3 = _mm_add_ps(fjx3,tx);
1806 fjy3 = _mm_add_ps(fjy3,ty);
1807 fjz3 = _mm_add_ps(fjz3,tz);
1809 /**************************
1810 * CALCULATE INTERACTIONS *
1811 **************************/
1813 r21 = _mm_mul_ps(rsq21,rinv21);
1815 /* EWALD ELECTROSTATICS */
1817 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1818 ewrt = _mm_mul_ps(r21,ewtabscale);
1819 ewitab = _mm_cvttps_epi32(ewrt);
1820 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1821 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1822 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1824 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1825 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1829 /* Calculate temporary vectorial force */
1830 tx = _mm_mul_ps(fscal,dx21);
1831 ty = _mm_mul_ps(fscal,dy21);
1832 tz = _mm_mul_ps(fscal,dz21);
1834 /* Update vectorial force */
1835 fix2 = _mm_add_ps(fix2,tx);
1836 fiy2 = _mm_add_ps(fiy2,ty);
1837 fiz2 = _mm_add_ps(fiz2,tz);
1839 fjx1 = _mm_add_ps(fjx1,tx);
1840 fjy1 = _mm_add_ps(fjy1,ty);
1841 fjz1 = _mm_add_ps(fjz1,tz);
1843 /**************************
1844 * CALCULATE INTERACTIONS *
1845 **************************/
1847 r22 = _mm_mul_ps(rsq22,rinv22);
1849 /* EWALD ELECTROSTATICS */
1851 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1852 ewrt = _mm_mul_ps(r22,ewtabscale);
1853 ewitab = _mm_cvttps_epi32(ewrt);
1854 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1855 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1856 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1858 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1859 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1863 /* Calculate temporary vectorial force */
1864 tx = _mm_mul_ps(fscal,dx22);
1865 ty = _mm_mul_ps(fscal,dy22);
1866 tz = _mm_mul_ps(fscal,dz22);
1868 /* Update vectorial force */
1869 fix2 = _mm_add_ps(fix2,tx);
1870 fiy2 = _mm_add_ps(fiy2,ty);
1871 fiz2 = _mm_add_ps(fiz2,tz);
1873 fjx2 = _mm_add_ps(fjx2,tx);
1874 fjy2 = _mm_add_ps(fjy2,ty);
1875 fjz2 = _mm_add_ps(fjz2,tz);
1877 /**************************
1878 * CALCULATE INTERACTIONS *
1879 **************************/
1881 r23 = _mm_mul_ps(rsq23,rinv23);
1883 /* EWALD ELECTROSTATICS */
1885 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1886 ewrt = _mm_mul_ps(r23,ewtabscale);
1887 ewitab = _mm_cvttps_epi32(ewrt);
1888 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1889 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1890 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1892 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1893 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1897 /* Calculate temporary vectorial force */
1898 tx = _mm_mul_ps(fscal,dx23);
1899 ty = _mm_mul_ps(fscal,dy23);
1900 tz = _mm_mul_ps(fscal,dz23);
1902 /* Update vectorial force */
1903 fix2 = _mm_add_ps(fix2,tx);
1904 fiy2 = _mm_add_ps(fiy2,ty);
1905 fiz2 = _mm_add_ps(fiz2,tz);
1907 fjx3 = _mm_add_ps(fjx3,tx);
1908 fjy3 = _mm_add_ps(fjy3,ty);
1909 fjz3 = _mm_add_ps(fjz3,tz);
1911 /**************************
1912 * CALCULATE INTERACTIONS *
1913 **************************/
1915 r31 = _mm_mul_ps(rsq31,rinv31);
1917 /* EWALD ELECTROSTATICS */
1919 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1920 ewrt = _mm_mul_ps(r31,ewtabscale);
1921 ewitab = _mm_cvttps_epi32(ewrt);
1922 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1923 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1924 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1926 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1927 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1931 /* Calculate temporary vectorial force */
1932 tx = _mm_mul_ps(fscal,dx31);
1933 ty = _mm_mul_ps(fscal,dy31);
1934 tz = _mm_mul_ps(fscal,dz31);
1936 /* Update vectorial force */
1937 fix3 = _mm_add_ps(fix3,tx);
1938 fiy3 = _mm_add_ps(fiy3,ty);
1939 fiz3 = _mm_add_ps(fiz3,tz);
1941 fjx1 = _mm_add_ps(fjx1,tx);
1942 fjy1 = _mm_add_ps(fjy1,ty);
1943 fjz1 = _mm_add_ps(fjz1,tz);
1945 /**************************
1946 * CALCULATE INTERACTIONS *
1947 **************************/
1949 r32 = _mm_mul_ps(rsq32,rinv32);
1951 /* EWALD ELECTROSTATICS */
1953 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1954 ewrt = _mm_mul_ps(r32,ewtabscale);
1955 ewitab = _mm_cvttps_epi32(ewrt);
1956 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1957 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1958 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1960 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1961 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1965 /* Calculate temporary vectorial force */
1966 tx = _mm_mul_ps(fscal,dx32);
1967 ty = _mm_mul_ps(fscal,dy32);
1968 tz = _mm_mul_ps(fscal,dz32);
1970 /* Update vectorial force */
1971 fix3 = _mm_add_ps(fix3,tx);
1972 fiy3 = _mm_add_ps(fiy3,ty);
1973 fiz3 = _mm_add_ps(fiz3,tz);
1975 fjx2 = _mm_add_ps(fjx2,tx);
1976 fjy2 = _mm_add_ps(fjy2,ty);
1977 fjz2 = _mm_add_ps(fjz2,tz);
1979 /**************************
1980 * CALCULATE INTERACTIONS *
1981 **************************/
1983 r33 = _mm_mul_ps(rsq33,rinv33);
1985 /* EWALD ELECTROSTATICS */
1987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1988 ewrt = _mm_mul_ps(r33,ewtabscale);
1989 ewitab = _mm_cvttps_epi32(ewrt);
1990 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1991 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1992 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1994 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1995 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1999 /* Calculate temporary vectorial force */
2000 tx = _mm_mul_ps(fscal,dx33);
2001 ty = _mm_mul_ps(fscal,dy33);
2002 tz = _mm_mul_ps(fscal,dz33);
2004 /* Update vectorial force */
2005 fix3 = _mm_add_ps(fix3,tx);
2006 fiy3 = _mm_add_ps(fiy3,ty);
2007 fiz3 = _mm_add_ps(fiz3,tz);
2009 fjx3 = _mm_add_ps(fjx3,tx);
2010 fjy3 = _mm_add_ps(fjy3,ty);
2011 fjz3 = _mm_add_ps(fjz3,tz);
2013 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2014 f+j_coord_offsetC,f+j_coord_offsetD,
2015 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2016 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2018 /* Inner loop uses 375 flops */
2021 if(jidx<j_index_end)
2024 /* Get j neighbor index, and coordinate index */
2026 jnrB = jjnr[jidx+1];
2027 jnrC = jjnr[jidx+2];
2028 jnrD = jjnr[jidx+3];
2030 /* Sign of each element will be negative for non-real atoms.
2031 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2032 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2034 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2035 jnrA = (jnrA>=0) ? jnrA : 0;
2036 jnrB = (jnrB>=0) ? jnrB : 0;
2037 jnrC = (jnrC>=0) ? jnrC : 0;
2038 jnrD = (jnrD>=0) ? jnrD : 0;
2040 j_coord_offsetA = DIM*jnrA;
2041 j_coord_offsetB = DIM*jnrB;
2042 j_coord_offsetC = DIM*jnrC;
2043 j_coord_offsetD = DIM*jnrD;
2045 /* load j atom coordinates */
2046 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2047 x+j_coord_offsetC,x+j_coord_offsetD,
2048 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2049 &jy2,&jz2,&jx3,&jy3,&jz3);
2051 /* Calculate displacement vector */
2052 dx00 = _mm_sub_ps(ix0,jx0);
2053 dy00 = _mm_sub_ps(iy0,jy0);
2054 dz00 = _mm_sub_ps(iz0,jz0);
2055 dx11 = _mm_sub_ps(ix1,jx1);
2056 dy11 = _mm_sub_ps(iy1,jy1);
2057 dz11 = _mm_sub_ps(iz1,jz1);
2058 dx12 = _mm_sub_ps(ix1,jx2);
2059 dy12 = _mm_sub_ps(iy1,jy2);
2060 dz12 = _mm_sub_ps(iz1,jz2);
2061 dx13 = _mm_sub_ps(ix1,jx3);
2062 dy13 = _mm_sub_ps(iy1,jy3);
2063 dz13 = _mm_sub_ps(iz1,jz3);
2064 dx21 = _mm_sub_ps(ix2,jx1);
2065 dy21 = _mm_sub_ps(iy2,jy1);
2066 dz21 = _mm_sub_ps(iz2,jz1);
2067 dx22 = _mm_sub_ps(ix2,jx2);
2068 dy22 = _mm_sub_ps(iy2,jy2);
2069 dz22 = _mm_sub_ps(iz2,jz2);
2070 dx23 = _mm_sub_ps(ix2,jx3);
2071 dy23 = _mm_sub_ps(iy2,jy3);
2072 dz23 = _mm_sub_ps(iz2,jz3);
2073 dx31 = _mm_sub_ps(ix3,jx1);
2074 dy31 = _mm_sub_ps(iy3,jy1);
2075 dz31 = _mm_sub_ps(iz3,jz1);
2076 dx32 = _mm_sub_ps(ix3,jx2);
2077 dy32 = _mm_sub_ps(iy3,jy2);
2078 dz32 = _mm_sub_ps(iz3,jz2);
2079 dx33 = _mm_sub_ps(ix3,jx3);
2080 dy33 = _mm_sub_ps(iy3,jy3);
2081 dz33 = _mm_sub_ps(iz3,jz3);
2083 /* Calculate squared distance and things based on it */
2084 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2085 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2086 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2087 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2088 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2089 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2090 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2091 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2092 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2093 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2095 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2096 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2097 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2098 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2099 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2100 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2101 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2102 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2103 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2104 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2106 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
2107 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
2108 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
2109 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
2110 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
2111 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
2112 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
2113 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
2114 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
2116 fjx0 = _mm_setzero_ps();
2117 fjy0 = _mm_setzero_ps();
2118 fjz0 = _mm_setzero_ps();
2119 fjx1 = _mm_setzero_ps();
2120 fjy1 = _mm_setzero_ps();
2121 fjz1 = _mm_setzero_ps();
2122 fjx2 = _mm_setzero_ps();
2123 fjy2 = _mm_setzero_ps();
2124 fjz2 = _mm_setzero_ps();
2125 fjx3 = _mm_setzero_ps();
2126 fjy3 = _mm_setzero_ps();
2127 fjz3 = _mm_setzero_ps();
2129 /**************************
2130 * CALCULATE INTERACTIONS *
2131 **************************/
2133 r00 = _mm_mul_ps(rsq00,rinv00);
2134 r00 = _mm_andnot_ps(dummy_mask,r00);
2136 /* Calculate table index by multiplying r with table scale and truncate to integer */
2137 rt = _mm_mul_ps(r00,vftabscale);
2138 vfitab = _mm_cvttps_epi32(rt);
2139 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2140 vfitab = _mm_slli_epi32(vfitab,3);
2142 /* CUBIC SPLINE TABLE DISPERSION */
2143 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2144 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2145 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2146 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2147 _MM_TRANSPOSE4_PS(Y,F,G,H);
2148 Heps = _mm_mul_ps(vfeps,H);
2149 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2150 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2151 fvdw6 = _mm_mul_ps(c6_00,FF);
2153 /* CUBIC SPLINE TABLE REPULSION */
2154 vfitab = _mm_add_epi32(vfitab,ifour);
2155 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2156 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2157 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2158 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2159 _MM_TRANSPOSE4_PS(Y,F,G,H);
2160 Heps = _mm_mul_ps(vfeps,H);
2161 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2162 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2163 fvdw12 = _mm_mul_ps(c12_00,FF);
2164 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2168 fscal = _mm_andnot_ps(dummy_mask,fscal);
2170 /* Calculate temporary vectorial force */
2171 tx = _mm_mul_ps(fscal,dx00);
2172 ty = _mm_mul_ps(fscal,dy00);
2173 tz = _mm_mul_ps(fscal,dz00);
2175 /* Update vectorial force */
2176 fix0 = _mm_add_ps(fix0,tx);
2177 fiy0 = _mm_add_ps(fiy0,ty);
2178 fiz0 = _mm_add_ps(fiz0,tz);
2180 fjx0 = _mm_add_ps(fjx0,tx);
2181 fjy0 = _mm_add_ps(fjy0,ty);
2182 fjz0 = _mm_add_ps(fjz0,tz);
2184 /**************************
2185 * CALCULATE INTERACTIONS *
2186 **************************/
2188 r11 = _mm_mul_ps(rsq11,rinv11);
2189 r11 = _mm_andnot_ps(dummy_mask,r11);
2191 /* EWALD ELECTROSTATICS */
2193 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2194 ewrt = _mm_mul_ps(r11,ewtabscale);
2195 ewitab = _mm_cvttps_epi32(ewrt);
2196 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2197 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2198 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2200 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2201 felec = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2205 fscal = _mm_andnot_ps(dummy_mask,fscal);
2207 /* Calculate temporary vectorial force */
2208 tx = _mm_mul_ps(fscal,dx11);
2209 ty = _mm_mul_ps(fscal,dy11);
2210 tz = _mm_mul_ps(fscal,dz11);
2212 /* Update vectorial force */
2213 fix1 = _mm_add_ps(fix1,tx);
2214 fiy1 = _mm_add_ps(fiy1,ty);
2215 fiz1 = _mm_add_ps(fiz1,tz);
2217 fjx1 = _mm_add_ps(fjx1,tx);
2218 fjy1 = _mm_add_ps(fjy1,ty);
2219 fjz1 = _mm_add_ps(fjz1,tz);
2221 /**************************
2222 * CALCULATE INTERACTIONS *
2223 **************************/
2225 r12 = _mm_mul_ps(rsq12,rinv12);
2226 r12 = _mm_andnot_ps(dummy_mask,r12);
2228 /* EWALD ELECTROSTATICS */
2230 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2231 ewrt = _mm_mul_ps(r12,ewtabscale);
2232 ewitab = _mm_cvttps_epi32(ewrt);
2233 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2234 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2235 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2237 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2238 felec = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2242 fscal = _mm_andnot_ps(dummy_mask,fscal);
2244 /* Calculate temporary vectorial force */
2245 tx = _mm_mul_ps(fscal,dx12);
2246 ty = _mm_mul_ps(fscal,dy12);
2247 tz = _mm_mul_ps(fscal,dz12);
2249 /* Update vectorial force */
2250 fix1 = _mm_add_ps(fix1,tx);
2251 fiy1 = _mm_add_ps(fiy1,ty);
2252 fiz1 = _mm_add_ps(fiz1,tz);
2254 fjx2 = _mm_add_ps(fjx2,tx);
2255 fjy2 = _mm_add_ps(fjy2,ty);
2256 fjz2 = _mm_add_ps(fjz2,tz);
2258 /**************************
2259 * CALCULATE INTERACTIONS *
2260 **************************/
2262 r13 = _mm_mul_ps(rsq13,rinv13);
2263 r13 = _mm_andnot_ps(dummy_mask,r13);
2265 /* EWALD ELECTROSTATICS */
2267 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2268 ewrt = _mm_mul_ps(r13,ewtabscale);
2269 ewitab = _mm_cvttps_epi32(ewrt);
2270 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2271 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2272 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2274 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2275 felec = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2279 fscal = _mm_andnot_ps(dummy_mask,fscal);
2281 /* Calculate temporary vectorial force */
2282 tx = _mm_mul_ps(fscal,dx13);
2283 ty = _mm_mul_ps(fscal,dy13);
2284 tz = _mm_mul_ps(fscal,dz13);
2286 /* Update vectorial force */
2287 fix1 = _mm_add_ps(fix1,tx);
2288 fiy1 = _mm_add_ps(fiy1,ty);
2289 fiz1 = _mm_add_ps(fiz1,tz);
2291 fjx3 = _mm_add_ps(fjx3,tx);
2292 fjy3 = _mm_add_ps(fjy3,ty);
2293 fjz3 = _mm_add_ps(fjz3,tz);
2295 /**************************
2296 * CALCULATE INTERACTIONS *
2297 **************************/
2299 r21 = _mm_mul_ps(rsq21,rinv21);
2300 r21 = _mm_andnot_ps(dummy_mask,r21);
2302 /* EWALD ELECTROSTATICS */
2304 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2305 ewrt = _mm_mul_ps(r21,ewtabscale);
2306 ewitab = _mm_cvttps_epi32(ewrt);
2307 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2308 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2309 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2311 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2312 felec = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2316 fscal = _mm_andnot_ps(dummy_mask,fscal);
2318 /* Calculate temporary vectorial force */
2319 tx = _mm_mul_ps(fscal,dx21);
2320 ty = _mm_mul_ps(fscal,dy21);
2321 tz = _mm_mul_ps(fscal,dz21);
2323 /* Update vectorial force */
2324 fix2 = _mm_add_ps(fix2,tx);
2325 fiy2 = _mm_add_ps(fiy2,ty);
2326 fiz2 = _mm_add_ps(fiz2,tz);
2328 fjx1 = _mm_add_ps(fjx1,tx);
2329 fjy1 = _mm_add_ps(fjy1,ty);
2330 fjz1 = _mm_add_ps(fjz1,tz);
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 r22 = _mm_mul_ps(rsq22,rinv22);
2337 r22 = _mm_andnot_ps(dummy_mask,r22);
2339 /* EWALD ELECTROSTATICS */
2341 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2342 ewrt = _mm_mul_ps(r22,ewtabscale);
2343 ewitab = _mm_cvttps_epi32(ewrt);
2344 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2345 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2346 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2348 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2349 felec = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2353 fscal = _mm_andnot_ps(dummy_mask,fscal);
2355 /* Calculate temporary vectorial force */
2356 tx = _mm_mul_ps(fscal,dx22);
2357 ty = _mm_mul_ps(fscal,dy22);
2358 tz = _mm_mul_ps(fscal,dz22);
2360 /* Update vectorial force */
2361 fix2 = _mm_add_ps(fix2,tx);
2362 fiy2 = _mm_add_ps(fiy2,ty);
2363 fiz2 = _mm_add_ps(fiz2,tz);
2365 fjx2 = _mm_add_ps(fjx2,tx);
2366 fjy2 = _mm_add_ps(fjy2,ty);
2367 fjz2 = _mm_add_ps(fjz2,tz);
2369 /**************************
2370 * CALCULATE INTERACTIONS *
2371 **************************/
2373 r23 = _mm_mul_ps(rsq23,rinv23);
2374 r23 = _mm_andnot_ps(dummy_mask,r23);
2376 /* EWALD ELECTROSTATICS */
2378 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2379 ewrt = _mm_mul_ps(r23,ewtabscale);
2380 ewitab = _mm_cvttps_epi32(ewrt);
2381 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2382 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2383 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2385 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2386 felec = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2390 fscal = _mm_andnot_ps(dummy_mask,fscal);
2392 /* Calculate temporary vectorial force */
2393 tx = _mm_mul_ps(fscal,dx23);
2394 ty = _mm_mul_ps(fscal,dy23);
2395 tz = _mm_mul_ps(fscal,dz23);
2397 /* Update vectorial force */
2398 fix2 = _mm_add_ps(fix2,tx);
2399 fiy2 = _mm_add_ps(fiy2,ty);
2400 fiz2 = _mm_add_ps(fiz2,tz);
2402 fjx3 = _mm_add_ps(fjx3,tx);
2403 fjy3 = _mm_add_ps(fjy3,ty);
2404 fjz3 = _mm_add_ps(fjz3,tz);
2406 /**************************
2407 * CALCULATE INTERACTIONS *
2408 **************************/
2410 r31 = _mm_mul_ps(rsq31,rinv31);
2411 r31 = _mm_andnot_ps(dummy_mask,r31);
2413 /* EWALD ELECTROSTATICS */
2415 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2416 ewrt = _mm_mul_ps(r31,ewtabscale);
2417 ewitab = _mm_cvttps_epi32(ewrt);
2418 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2419 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2420 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2422 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2423 felec = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2427 fscal = _mm_andnot_ps(dummy_mask,fscal);
2429 /* Calculate temporary vectorial force */
2430 tx = _mm_mul_ps(fscal,dx31);
2431 ty = _mm_mul_ps(fscal,dy31);
2432 tz = _mm_mul_ps(fscal,dz31);
2434 /* Update vectorial force */
2435 fix3 = _mm_add_ps(fix3,tx);
2436 fiy3 = _mm_add_ps(fiy3,ty);
2437 fiz3 = _mm_add_ps(fiz3,tz);
2439 fjx1 = _mm_add_ps(fjx1,tx);
2440 fjy1 = _mm_add_ps(fjy1,ty);
2441 fjz1 = _mm_add_ps(fjz1,tz);
2443 /**************************
2444 * CALCULATE INTERACTIONS *
2445 **************************/
2447 r32 = _mm_mul_ps(rsq32,rinv32);
2448 r32 = _mm_andnot_ps(dummy_mask,r32);
2450 /* EWALD ELECTROSTATICS */
2452 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2453 ewrt = _mm_mul_ps(r32,ewtabscale);
2454 ewitab = _mm_cvttps_epi32(ewrt);
2455 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2456 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2457 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2459 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2460 felec = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2464 fscal = _mm_andnot_ps(dummy_mask,fscal);
2466 /* Calculate temporary vectorial force */
2467 tx = _mm_mul_ps(fscal,dx32);
2468 ty = _mm_mul_ps(fscal,dy32);
2469 tz = _mm_mul_ps(fscal,dz32);
2471 /* Update vectorial force */
2472 fix3 = _mm_add_ps(fix3,tx);
2473 fiy3 = _mm_add_ps(fiy3,ty);
2474 fiz3 = _mm_add_ps(fiz3,tz);
2476 fjx2 = _mm_add_ps(fjx2,tx);
2477 fjy2 = _mm_add_ps(fjy2,ty);
2478 fjz2 = _mm_add_ps(fjz2,tz);
2480 /**************************
2481 * CALCULATE INTERACTIONS *
2482 **************************/
2484 r33 = _mm_mul_ps(rsq33,rinv33);
2485 r33 = _mm_andnot_ps(dummy_mask,r33);
2487 /* EWALD ELECTROSTATICS */
2489 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2490 ewrt = _mm_mul_ps(r33,ewtabscale);
2491 ewitab = _mm_cvttps_epi32(ewrt);
2492 eweps = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2493 gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2494 ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2496 felec = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2497 felec = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2501 fscal = _mm_andnot_ps(dummy_mask,fscal);
2503 /* Calculate temporary vectorial force */
2504 tx = _mm_mul_ps(fscal,dx33);
2505 ty = _mm_mul_ps(fscal,dy33);
2506 tz = _mm_mul_ps(fscal,dz33);
2508 /* Update vectorial force */
2509 fix3 = _mm_add_ps(fix3,tx);
2510 fiy3 = _mm_add_ps(fiy3,ty);
2511 fiz3 = _mm_add_ps(fiz3,tz);
2513 fjx3 = _mm_add_ps(fjx3,tx);
2514 fjy3 = _mm_add_ps(fjy3,ty);
2515 fjz3 = _mm_add_ps(fjz3,tz);
2517 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2518 f+j_coord_offsetC,f+j_coord_offsetD,
2519 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2520 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2522 /* Inner loop uses 385 flops */
2525 /* End of innermost loop */
2527 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2528 f+i_coord_offset,fshift+i_shift_offset);
2530 /* Increment number of inner iterations */
2531 inneriter += j_index_end - j_index_start;
2533 /* Outer loop uses 36 flops */
2536 /* Increment number of outer iterations */
2539 /* Update outer/inner flops */
2541 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*385);