2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128 dummy_mask,cutoff_mask;
105 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106 __m128 one = _mm_set1_ps(1.0);
107 __m128 two = _mm_set1_ps(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_ps(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 vftab = kernel_data->table_elec_vdw->data;
126 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
130 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
131 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
132 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
133 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
135 jq1 = _mm_set1_ps(charge[inr+1]);
136 jq2 = _mm_set1_ps(charge[inr+2]);
137 jq3 = _mm_set1_ps(charge[inr+3]);
138 vdwjidx0A = 2*vdwtype[inr+0];
139 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
141 qq11 = _mm_mul_ps(iq1,jq1);
142 qq12 = _mm_mul_ps(iq1,jq2);
143 qq13 = _mm_mul_ps(iq1,jq3);
144 qq21 = _mm_mul_ps(iq2,jq1);
145 qq22 = _mm_mul_ps(iq2,jq2);
146 qq23 = _mm_mul_ps(iq2,jq3);
147 qq31 = _mm_mul_ps(iq3,jq1);
148 qq32 = _mm_mul_ps(iq3,jq2);
149 qq33 = _mm_mul_ps(iq3,jq3);
151 /* Avoid stupid compiler warnings */
152 jnrA = jnrB = jnrC = jnrD = 0;
161 /* Start outer loop over neighborlists */
162 for(iidx=0; iidx<nri; iidx++)
164 /* Load shift vector for this list */
165 i_shift_offset = DIM*shiftidx[iidx];
166 shX = shiftvec[i_shift_offset+XX];
167 shY = shiftvec[i_shift_offset+YY];
168 shZ = shiftvec[i_shift_offset+ZZ];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
180 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
181 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
182 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
183 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
184 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
185 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
186 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
187 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
188 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
189 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
190 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
192 fix0 = _mm_setzero_ps();
193 fiy0 = _mm_setzero_ps();
194 fiz0 = _mm_setzero_ps();
195 fix1 = _mm_setzero_ps();
196 fiy1 = _mm_setzero_ps();
197 fiz1 = _mm_setzero_ps();
198 fix2 = _mm_setzero_ps();
199 fiy2 = _mm_setzero_ps();
200 fiz2 = _mm_setzero_ps();
201 fix3 = _mm_setzero_ps();
202 fiy3 = _mm_setzero_ps();
203 fiz3 = _mm_setzero_ps();
205 /* Reset potential sums */
206 velecsum = _mm_setzero_ps();
207 vvdwsum = _mm_setzero_ps();
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
228 &jy2,&jz2,&jx3,&jy3,&jz3);
230 /* Calculate displacement vector */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx11 = _mm_sub_ps(ix1,jx1);
235 dy11 = _mm_sub_ps(iy1,jy1);
236 dz11 = _mm_sub_ps(iz1,jz1);
237 dx12 = _mm_sub_ps(ix1,jx2);
238 dy12 = _mm_sub_ps(iy1,jy2);
239 dz12 = _mm_sub_ps(iz1,jz2);
240 dx13 = _mm_sub_ps(ix1,jx3);
241 dy13 = _mm_sub_ps(iy1,jy3);
242 dz13 = _mm_sub_ps(iz1,jz3);
243 dx21 = _mm_sub_ps(ix2,jx1);
244 dy21 = _mm_sub_ps(iy2,jy1);
245 dz21 = _mm_sub_ps(iz2,jz1);
246 dx22 = _mm_sub_ps(ix2,jx2);
247 dy22 = _mm_sub_ps(iy2,jy2);
248 dz22 = _mm_sub_ps(iz2,jz2);
249 dx23 = _mm_sub_ps(ix2,jx3);
250 dy23 = _mm_sub_ps(iy2,jy3);
251 dz23 = _mm_sub_ps(iz2,jz3);
252 dx31 = _mm_sub_ps(ix3,jx1);
253 dy31 = _mm_sub_ps(iy3,jy1);
254 dz31 = _mm_sub_ps(iz3,jz1);
255 dx32 = _mm_sub_ps(ix3,jx2);
256 dy32 = _mm_sub_ps(iy3,jy2);
257 dz32 = _mm_sub_ps(iz3,jz2);
258 dx33 = _mm_sub_ps(ix3,jx3);
259 dy33 = _mm_sub_ps(iy3,jy3);
260 dz33 = _mm_sub_ps(iz3,jz3);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
267 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
270 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
271 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
272 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
274 rinv00 = gmx_mm_invsqrt_ps(rsq00);
275 rinv11 = gmx_mm_invsqrt_ps(rsq11);
276 rinv12 = gmx_mm_invsqrt_ps(rsq12);
277 rinv13 = gmx_mm_invsqrt_ps(rsq13);
278 rinv21 = gmx_mm_invsqrt_ps(rsq21);
279 rinv22 = gmx_mm_invsqrt_ps(rsq22);
280 rinv23 = gmx_mm_invsqrt_ps(rsq23);
281 rinv31 = gmx_mm_invsqrt_ps(rsq31);
282 rinv32 = gmx_mm_invsqrt_ps(rsq32);
283 rinv33 = gmx_mm_invsqrt_ps(rsq33);
285 fjx0 = _mm_setzero_ps();
286 fjy0 = _mm_setzero_ps();
287 fjz0 = _mm_setzero_ps();
288 fjx1 = _mm_setzero_ps();
289 fjy1 = _mm_setzero_ps();
290 fjz1 = _mm_setzero_ps();
291 fjx2 = _mm_setzero_ps();
292 fjy2 = _mm_setzero_ps();
293 fjz2 = _mm_setzero_ps();
294 fjx3 = _mm_setzero_ps();
295 fjy3 = _mm_setzero_ps();
296 fjz3 = _mm_setzero_ps();
298 /**************************
299 * CALCULATE INTERACTIONS *
300 **************************/
302 r00 = _mm_mul_ps(rsq00,rinv00);
304 /* Calculate table index by multiplying r with table scale and truncate to integer */
305 rt = _mm_mul_ps(r00,vftabscale);
306 vfitab = _mm_cvttps_epi32(rt);
307 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
308 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
310 /* CUBIC SPLINE TABLE DISPERSION */
311 vfitab = _mm_add_epi32(vfitab,ifour);
312 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
313 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
314 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
315 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
316 _MM_TRANSPOSE4_PS(Y,F,G,H);
317 Heps = _mm_mul_ps(vfeps,H);
318 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
319 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
320 vvdw6 = _mm_mul_ps(c6_00,VV);
321 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
322 fvdw6 = _mm_mul_ps(c6_00,FF);
324 /* CUBIC SPLINE TABLE REPULSION */
325 vfitab = _mm_add_epi32(vfitab,ifour);
326 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
327 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
328 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
329 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
330 _MM_TRANSPOSE4_PS(Y,F,G,H);
331 Heps = _mm_mul_ps(vfeps,H);
332 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
333 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
334 vvdw12 = _mm_mul_ps(c12_00,VV);
335 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
336 fvdw12 = _mm_mul_ps(c12_00,FF);
337 vvdw = _mm_add_ps(vvdw12,vvdw6);
338 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
340 /* Update potential sum for this i atom from the interaction with this j atom. */
341 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
345 /* Calculate temporary vectorial force */
346 tx = _mm_mul_ps(fscal,dx00);
347 ty = _mm_mul_ps(fscal,dy00);
348 tz = _mm_mul_ps(fscal,dz00);
350 /* Update vectorial force */
351 fix0 = _mm_add_ps(fix0,tx);
352 fiy0 = _mm_add_ps(fiy0,ty);
353 fiz0 = _mm_add_ps(fiz0,tz);
355 fjx0 = _mm_add_ps(fjx0,tx);
356 fjy0 = _mm_add_ps(fjy0,ty);
357 fjz0 = _mm_add_ps(fjz0,tz);
359 /**************************
360 * CALCULATE INTERACTIONS *
361 **************************/
363 r11 = _mm_mul_ps(rsq11,rinv11);
365 /* Calculate table index by multiplying r with table scale and truncate to integer */
366 rt = _mm_mul_ps(r11,vftabscale);
367 vfitab = _mm_cvttps_epi32(rt);
368 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
369 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
371 /* CUBIC SPLINE TABLE ELECTROSTATICS */
372 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
373 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
374 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
375 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
376 _MM_TRANSPOSE4_PS(Y,F,G,H);
377 Heps = _mm_mul_ps(vfeps,H);
378 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
379 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
380 velec = _mm_mul_ps(qq11,VV);
381 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
382 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
384 /* Update potential sum for this i atom from the interaction with this j atom. */
385 velecsum = _mm_add_ps(velecsum,velec);
389 /* Calculate temporary vectorial force */
390 tx = _mm_mul_ps(fscal,dx11);
391 ty = _mm_mul_ps(fscal,dy11);
392 tz = _mm_mul_ps(fscal,dz11);
394 /* Update vectorial force */
395 fix1 = _mm_add_ps(fix1,tx);
396 fiy1 = _mm_add_ps(fiy1,ty);
397 fiz1 = _mm_add_ps(fiz1,tz);
399 fjx1 = _mm_add_ps(fjx1,tx);
400 fjy1 = _mm_add_ps(fjy1,ty);
401 fjz1 = _mm_add_ps(fjz1,tz);
403 /**************************
404 * CALCULATE INTERACTIONS *
405 **************************/
407 r12 = _mm_mul_ps(rsq12,rinv12);
409 /* Calculate table index by multiplying r with table scale and truncate to integer */
410 rt = _mm_mul_ps(r12,vftabscale);
411 vfitab = _mm_cvttps_epi32(rt);
412 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
413 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
415 /* CUBIC SPLINE TABLE ELECTROSTATICS */
416 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
417 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
418 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
419 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
420 _MM_TRANSPOSE4_PS(Y,F,G,H);
421 Heps = _mm_mul_ps(vfeps,H);
422 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
423 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
424 velec = _mm_mul_ps(qq12,VV);
425 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
426 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velecsum = _mm_add_ps(velecsum,velec);
433 /* Calculate temporary vectorial force */
434 tx = _mm_mul_ps(fscal,dx12);
435 ty = _mm_mul_ps(fscal,dy12);
436 tz = _mm_mul_ps(fscal,dz12);
438 /* Update vectorial force */
439 fix1 = _mm_add_ps(fix1,tx);
440 fiy1 = _mm_add_ps(fiy1,ty);
441 fiz1 = _mm_add_ps(fiz1,tz);
443 fjx2 = _mm_add_ps(fjx2,tx);
444 fjy2 = _mm_add_ps(fjy2,ty);
445 fjz2 = _mm_add_ps(fjz2,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 r13 = _mm_mul_ps(rsq13,rinv13);
453 /* Calculate table index by multiplying r with table scale and truncate to integer */
454 rt = _mm_mul_ps(r13,vftabscale);
455 vfitab = _mm_cvttps_epi32(rt);
456 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
457 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
459 /* CUBIC SPLINE TABLE ELECTROSTATICS */
460 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
461 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
462 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
463 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
464 _MM_TRANSPOSE4_PS(Y,F,G,H);
465 Heps = _mm_mul_ps(vfeps,H);
466 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
467 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
468 velec = _mm_mul_ps(qq13,VV);
469 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
470 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
472 /* Update potential sum for this i atom from the interaction with this j atom. */
473 velecsum = _mm_add_ps(velecsum,velec);
477 /* Calculate temporary vectorial force */
478 tx = _mm_mul_ps(fscal,dx13);
479 ty = _mm_mul_ps(fscal,dy13);
480 tz = _mm_mul_ps(fscal,dz13);
482 /* Update vectorial force */
483 fix1 = _mm_add_ps(fix1,tx);
484 fiy1 = _mm_add_ps(fiy1,ty);
485 fiz1 = _mm_add_ps(fiz1,tz);
487 fjx3 = _mm_add_ps(fjx3,tx);
488 fjy3 = _mm_add_ps(fjy3,ty);
489 fjz3 = _mm_add_ps(fjz3,tz);
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 r21 = _mm_mul_ps(rsq21,rinv21);
497 /* Calculate table index by multiplying r with table scale and truncate to integer */
498 rt = _mm_mul_ps(r21,vftabscale);
499 vfitab = _mm_cvttps_epi32(rt);
500 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
501 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
503 /* CUBIC SPLINE TABLE ELECTROSTATICS */
504 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
505 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
506 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
507 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
508 _MM_TRANSPOSE4_PS(Y,F,G,H);
509 Heps = _mm_mul_ps(vfeps,H);
510 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
511 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
512 velec = _mm_mul_ps(qq21,VV);
513 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
514 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
516 /* Update potential sum for this i atom from the interaction with this j atom. */
517 velecsum = _mm_add_ps(velecsum,velec);
521 /* Calculate temporary vectorial force */
522 tx = _mm_mul_ps(fscal,dx21);
523 ty = _mm_mul_ps(fscal,dy21);
524 tz = _mm_mul_ps(fscal,dz21);
526 /* Update vectorial force */
527 fix2 = _mm_add_ps(fix2,tx);
528 fiy2 = _mm_add_ps(fiy2,ty);
529 fiz2 = _mm_add_ps(fiz2,tz);
531 fjx1 = _mm_add_ps(fjx1,tx);
532 fjy1 = _mm_add_ps(fjy1,ty);
533 fjz1 = _mm_add_ps(fjz1,tz);
535 /**************************
536 * CALCULATE INTERACTIONS *
537 **************************/
539 r22 = _mm_mul_ps(rsq22,rinv22);
541 /* Calculate table index by multiplying r with table scale and truncate to integer */
542 rt = _mm_mul_ps(r22,vftabscale);
543 vfitab = _mm_cvttps_epi32(rt);
544 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
545 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
547 /* CUBIC SPLINE TABLE ELECTROSTATICS */
548 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
549 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
550 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
551 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
552 _MM_TRANSPOSE4_PS(Y,F,G,H);
553 Heps = _mm_mul_ps(vfeps,H);
554 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
555 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
556 velec = _mm_mul_ps(qq22,VV);
557 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
558 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velecsum = _mm_add_ps(velecsum,velec);
565 /* Calculate temporary vectorial force */
566 tx = _mm_mul_ps(fscal,dx22);
567 ty = _mm_mul_ps(fscal,dy22);
568 tz = _mm_mul_ps(fscal,dz22);
570 /* Update vectorial force */
571 fix2 = _mm_add_ps(fix2,tx);
572 fiy2 = _mm_add_ps(fiy2,ty);
573 fiz2 = _mm_add_ps(fiz2,tz);
575 fjx2 = _mm_add_ps(fjx2,tx);
576 fjy2 = _mm_add_ps(fjy2,ty);
577 fjz2 = _mm_add_ps(fjz2,tz);
579 /**************************
580 * CALCULATE INTERACTIONS *
581 **************************/
583 r23 = _mm_mul_ps(rsq23,rinv23);
585 /* Calculate table index by multiplying r with table scale and truncate to integer */
586 rt = _mm_mul_ps(r23,vftabscale);
587 vfitab = _mm_cvttps_epi32(rt);
588 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
589 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
591 /* CUBIC SPLINE TABLE ELECTROSTATICS */
592 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
593 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
594 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
595 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
596 _MM_TRANSPOSE4_PS(Y,F,G,H);
597 Heps = _mm_mul_ps(vfeps,H);
598 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
599 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
600 velec = _mm_mul_ps(qq23,VV);
601 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
602 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
604 /* Update potential sum for this i atom from the interaction with this j atom. */
605 velecsum = _mm_add_ps(velecsum,velec);
609 /* Calculate temporary vectorial force */
610 tx = _mm_mul_ps(fscal,dx23);
611 ty = _mm_mul_ps(fscal,dy23);
612 tz = _mm_mul_ps(fscal,dz23);
614 /* Update vectorial force */
615 fix2 = _mm_add_ps(fix2,tx);
616 fiy2 = _mm_add_ps(fiy2,ty);
617 fiz2 = _mm_add_ps(fiz2,tz);
619 fjx3 = _mm_add_ps(fjx3,tx);
620 fjy3 = _mm_add_ps(fjy3,ty);
621 fjz3 = _mm_add_ps(fjz3,tz);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 r31 = _mm_mul_ps(rsq31,rinv31);
629 /* Calculate table index by multiplying r with table scale and truncate to integer */
630 rt = _mm_mul_ps(r31,vftabscale);
631 vfitab = _mm_cvttps_epi32(rt);
632 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
633 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
635 /* CUBIC SPLINE TABLE ELECTROSTATICS */
636 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
637 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
638 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
639 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
640 _MM_TRANSPOSE4_PS(Y,F,G,H);
641 Heps = _mm_mul_ps(vfeps,H);
642 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
643 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
644 velec = _mm_mul_ps(qq31,VV);
645 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
646 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
648 /* Update potential sum for this i atom from the interaction with this j atom. */
649 velecsum = _mm_add_ps(velecsum,velec);
653 /* Calculate temporary vectorial force */
654 tx = _mm_mul_ps(fscal,dx31);
655 ty = _mm_mul_ps(fscal,dy31);
656 tz = _mm_mul_ps(fscal,dz31);
658 /* Update vectorial force */
659 fix3 = _mm_add_ps(fix3,tx);
660 fiy3 = _mm_add_ps(fiy3,ty);
661 fiz3 = _mm_add_ps(fiz3,tz);
663 fjx1 = _mm_add_ps(fjx1,tx);
664 fjy1 = _mm_add_ps(fjy1,ty);
665 fjz1 = _mm_add_ps(fjz1,tz);
667 /**************************
668 * CALCULATE INTERACTIONS *
669 **************************/
671 r32 = _mm_mul_ps(rsq32,rinv32);
673 /* Calculate table index by multiplying r with table scale and truncate to integer */
674 rt = _mm_mul_ps(r32,vftabscale);
675 vfitab = _mm_cvttps_epi32(rt);
676 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
677 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
679 /* CUBIC SPLINE TABLE ELECTROSTATICS */
680 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
681 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
682 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
683 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
684 _MM_TRANSPOSE4_PS(Y,F,G,H);
685 Heps = _mm_mul_ps(vfeps,H);
686 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
687 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
688 velec = _mm_mul_ps(qq32,VV);
689 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
690 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velecsum = _mm_add_ps(velecsum,velec);
697 /* Calculate temporary vectorial force */
698 tx = _mm_mul_ps(fscal,dx32);
699 ty = _mm_mul_ps(fscal,dy32);
700 tz = _mm_mul_ps(fscal,dz32);
702 /* Update vectorial force */
703 fix3 = _mm_add_ps(fix3,tx);
704 fiy3 = _mm_add_ps(fiy3,ty);
705 fiz3 = _mm_add_ps(fiz3,tz);
707 fjx2 = _mm_add_ps(fjx2,tx);
708 fjy2 = _mm_add_ps(fjy2,ty);
709 fjz2 = _mm_add_ps(fjz2,tz);
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
715 r33 = _mm_mul_ps(rsq33,rinv33);
717 /* Calculate table index by multiplying r with table scale and truncate to integer */
718 rt = _mm_mul_ps(r33,vftabscale);
719 vfitab = _mm_cvttps_epi32(rt);
720 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
721 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
723 /* CUBIC SPLINE TABLE ELECTROSTATICS */
724 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
725 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
726 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
727 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
728 _MM_TRANSPOSE4_PS(Y,F,G,H);
729 Heps = _mm_mul_ps(vfeps,H);
730 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
731 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
732 velec = _mm_mul_ps(qq33,VV);
733 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
734 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velecsum = _mm_add_ps(velecsum,velec);
741 /* Calculate temporary vectorial force */
742 tx = _mm_mul_ps(fscal,dx33);
743 ty = _mm_mul_ps(fscal,dy33);
744 tz = _mm_mul_ps(fscal,dz33);
746 /* Update vectorial force */
747 fix3 = _mm_add_ps(fix3,tx);
748 fiy3 = _mm_add_ps(fiy3,ty);
749 fiz3 = _mm_add_ps(fiz3,tz);
751 fjx3 = _mm_add_ps(fjx3,tx);
752 fjy3 = _mm_add_ps(fjy3,ty);
753 fjz3 = _mm_add_ps(fjz3,tz);
755 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
756 f+j_coord_offsetC,f+j_coord_offsetD,
757 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
758 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
760 /* Inner loop uses 446 flops */
766 /* Get j neighbor index, and coordinate index */
772 /* Sign of each element will be negative for non-real atoms.
773 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
774 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
776 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
777 jnrA = (jnrA>=0) ? jnrA : 0;
778 jnrB = (jnrB>=0) ? jnrB : 0;
779 jnrC = (jnrC>=0) ? jnrC : 0;
780 jnrD = (jnrD>=0) ? jnrD : 0;
782 j_coord_offsetA = DIM*jnrA;
783 j_coord_offsetB = DIM*jnrB;
784 j_coord_offsetC = DIM*jnrC;
785 j_coord_offsetD = DIM*jnrD;
787 /* load j atom coordinates */
788 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
789 x+j_coord_offsetC,x+j_coord_offsetD,
790 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
791 &jy2,&jz2,&jx3,&jy3,&jz3);
793 /* Calculate displacement vector */
794 dx00 = _mm_sub_ps(ix0,jx0);
795 dy00 = _mm_sub_ps(iy0,jy0);
796 dz00 = _mm_sub_ps(iz0,jz0);
797 dx11 = _mm_sub_ps(ix1,jx1);
798 dy11 = _mm_sub_ps(iy1,jy1);
799 dz11 = _mm_sub_ps(iz1,jz1);
800 dx12 = _mm_sub_ps(ix1,jx2);
801 dy12 = _mm_sub_ps(iy1,jy2);
802 dz12 = _mm_sub_ps(iz1,jz2);
803 dx13 = _mm_sub_ps(ix1,jx3);
804 dy13 = _mm_sub_ps(iy1,jy3);
805 dz13 = _mm_sub_ps(iz1,jz3);
806 dx21 = _mm_sub_ps(ix2,jx1);
807 dy21 = _mm_sub_ps(iy2,jy1);
808 dz21 = _mm_sub_ps(iz2,jz1);
809 dx22 = _mm_sub_ps(ix2,jx2);
810 dy22 = _mm_sub_ps(iy2,jy2);
811 dz22 = _mm_sub_ps(iz2,jz2);
812 dx23 = _mm_sub_ps(ix2,jx3);
813 dy23 = _mm_sub_ps(iy2,jy3);
814 dz23 = _mm_sub_ps(iz2,jz3);
815 dx31 = _mm_sub_ps(ix3,jx1);
816 dy31 = _mm_sub_ps(iy3,jy1);
817 dz31 = _mm_sub_ps(iz3,jz1);
818 dx32 = _mm_sub_ps(ix3,jx2);
819 dy32 = _mm_sub_ps(iy3,jy2);
820 dz32 = _mm_sub_ps(iz3,jz2);
821 dx33 = _mm_sub_ps(ix3,jx3);
822 dy33 = _mm_sub_ps(iy3,jy3);
823 dz33 = _mm_sub_ps(iz3,jz3);
825 /* Calculate squared distance and things based on it */
826 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
827 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
828 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
829 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
830 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
831 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
832 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
833 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
834 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
835 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
837 rinv00 = gmx_mm_invsqrt_ps(rsq00);
838 rinv11 = gmx_mm_invsqrt_ps(rsq11);
839 rinv12 = gmx_mm_invsqrt_ps(rsq12);
840 rinv13 = gmx_mm_invsqrt_ps(rsq13);
841 rinv21 = gmx_mm_invsqrt_ps(rsq21);
842 rinv22 = gmx_mm_invsqrt_ps(rsq22);
843 rinv23 = gmx_mm_invsqrt_ps(rsq23);
844 rinv31 = gmx_mm_invsqrt_ps(rsq31);
845 rinv32 = gmx_mm_invsqrt_ps(rsq32);
846 rinv33 = gmx_mm_invsqrt_ps(rsq33);
848 fjx0 = _mm_setzero_ps();
849 fjy0 = _mm_setzero_ps();
850 fjz0 = _mm_setzero_ps();
851 fjx1 = _mm_setzero_ps();
852 fjy1 = _mm_setzero_ps();
853 fjz1 = _mm_setzero_ps();
854 fjx2 = _mm_setzero_ps();
855 fjy2 = _mm_setzero_ps();
856 fjz2 = _mm_setzero_ps();
857 fjx3 = _mm_setzero_ps();
858 fjy3 = _mm_setzero_ps();
859 fjz3 = _mm_setzero_ps();
861 /**************************
862 * CALCULATE INTERACTIONS *
863 **************************/
865 r00 = _mm_mul_ps(rsq00,rinv00);
866 r00 = _mm_andnot_ps(dummy_mask,r00);
868 /* Calculate table index by multiplying r with table scale and truncate to integer */
869 rt = _mm_mul_ps(r00,vftabscale);
870 vfitab = _mm_cvttps_epi32(rt);
871 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
872 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
874 /* CUBIC SPLINE TABLE DISPERSION */
875 vfitab = _mm_add_epi32(vfitab,ifour);
876 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
877 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
878 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
879 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
880 _MM_TRANSPOSE4_PS(Y,F,G,H);
881 Heps = _mm_mul_ps(vfeps,H);
882 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
883 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
884 vvdw6 = _mm_mul_ps(c6_00,VV);
885 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
886 fvdw6 = _mm_mul_ps(c6_00,FF);
888 /* CUBIC SPLINE TABLE REPULSION */
889 vfitab = _mm_add_epi32(vfitab,ifour);
890 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
891 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
892 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
893 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
894 _MM_TRANSPOSE4_PS(Y,F,G,H);
895 Heps = _mm_mul_ps(vfeps,H);
896 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
897 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
898 vvdw12 = _mm_mul_ps(c12_00,VV);
899 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
900 fvdw12 = _mm_mul_ps(c12_00,FF);
901 vvdw = _mm_add_ps(vvdw12,vvdw6);
902 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
904 /* Update potential sum for this i atom from the interaction with this j atom. */
905 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
906 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
910 fscal = _mm_andnot_ps(dummy_mask,fscal);
912 /* Calculate temporary vectorial force */
913 tx = _mm_mul_ps(fscal,dx00);
914 ty = _mm_mul_ps(fscal,dy00);
915 tz = _mm_mul_ps(fscal,dz00);
917 /* Update vectorial force */
918 fix0 = _mm_add_ps(fix0,tx);
919 fiy0 = _mm_add_ps(fiy0,ty);
920 fiz0 = _mm_add_ps(fiz0,tz);
922 fjx0 = _mm_add_ps(fjx0,tx);
923 fjy0 = _mm_add_ps(fjy0,ty);
924 fjz0 = _mm_add_ps(fjz0,tz);
926 /**************************
927 * CALCULATE INTERACTIONS *
928 **************************/
930 r11 = _mm_mul_ps(rsq11,rinv11);
931 r11 = _mm_andnot_ps(dummy_mask,r11);
933 /* Calculate table index by multiplying r with table scale and truncate to integer */
934 rt = _mm_mul_ps(r11,vftabscale);
935 vfitab = _mm_cvttps_epi32(rt);
936 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
937 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
939 /* CUBIC SPLINE TABLE ELECTROSTATICS */
940 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
941 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
942 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
943 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
944 _MM_TRANSPOSE4_PS(Y,F,G,H);
945 Heps = _mm_mul_ps(vfeps,H);
946 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
947 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
948 velec = _mm_mul_ps(qq11,VV);
949 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
950 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
952 /* Update potential sum for this i atom from the interaction with this j atom. */
953 velec = _mm_andnot_ps(dummy_mask,velec);
954 velecsum = _mm_add_ps(velecsum,velec);
958 fscal = _mm_andnot_ps(dummy_mask,fscal);
960 /* Calculate temporary vectorial force */
961 tx = _mm_mul_ps(fscal,dx11);
962 ty = _mm_mul_ps(fscal,dy11);
963 tz = _mm_mul_ps(fscal,dz11);
965 /* Update vectorial force */
966 fix1 = _mm_add_ps(fix1,tx);
967 fiy1 = _mm_add_ps(fiy1,ty);
968 fiz1 = _mm_add_ps(fiz1,tz);
970 fjx1 = _mm_add_ps(fjx1,tx);
971 fjy1 = _mm_add_ps(fjy1,ty);
972 fjz1 = _mm_add_ps(fjz1,tz);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 r12 = _mm_mul_ps(rsq12,rinv12);
979 r12 = _mm_andnot_ps(dummy_mask,r12);
981 /* Calculate table index by multiplying r with table scale and truncate to integer */
982 rt = _mm_mul_ps(r12,vftabscale);
983 vfitab = _mm_cvttps_epi32(rt);
984 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
985 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
987 /* CUBIC SPLINE TABLE ELECTROSTATICS */
988 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
989 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
990 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
991 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
992 _MM_TRANSPOSE4_PS(Y,F,G,H);
993 Heps = _mm_mul_ps(vfeps,H);
994 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
995 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
996 velec = _mm_mul_ps(qq12,VV);
997 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
998 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1000 /* Update potential sum for this i atom from the interaction with this j atom. */
1001 velec = _mm_andnot_ps(dummy_mask,velec);
1002 velecsum = _mm_add_ps(velecsum,velec);
1006 fscal = _mm_andnot_ps(dummy_mask,fscal);
1008 /* Calculate temporary vectorial force */
1009 tx = _mm_mul_ps(fscal,dx12);
1010 ty = _mm_mul_ps(fscal,dy12);
1011 tz = _mm_mul_ps(fscal,dz12);
1013 /* Update vectorial force */
1014 fix1 = _mm_add_ps(fix1,tx);
1015 fiy1 = _mm_add_ps(fiy1,ty);
1016 fiz1 = _mm_add_ps(fiz1,tz);
1018 fjx2 = _mm_add_ps(fjx2,tx);
1019 fjy2 = _mm_add_ps(fjy2,ty);
1020 fjz2 = _mm_add_ps(fjz2,tz);
1022 /**************************
1023 * CALCULATE INTERACTIONS *
1024 **************************/
1026 r13 = _mm_mul_ps(rsq13,rinv13);
1027 r13 = _mm_andnot_ps(dummy_mask,r13);
1029 /* Calculate table index by multiplying r with table scale and truncate to integer */
1030 rt = _mm_mul_ps(r13,vftabscale);
1031 vfitab = _mm_cvttps_epi32(rt);
1032 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1033 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1035 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1036 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1037 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1038 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1039 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1040 _MM_TRANSPOSE4_PS(Y,F,G,H);
1041 Heps = _mm_mul_ps(vfeps,H);
1042 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1043 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1044 velec = _mm_mul_ps(qq13,VV);
1045 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1046 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec = _mm_andnot_ps(dummy_mask,velec);
1050 velecsum = _mm_add_ps(velecsum,velec);
1054 fscal = _mm_andnot_ps(dummy_mask,fscal);
1056 /* Calculate temporary vectorial force */
1057 tx = _mm_mul_ps(fscal,dx13);
1058 ty = _mm_mul_ps(fscal,dy13);
1059 tz = _mm_mul_ps(fscal,dz13);
1061 /* Update vectorial force */
1062 fix1 = _mm_add_ps(fix1,tx);
1063 fiy1 = _mm_add_ps(fiy1,ty);
1064 fiz1 = _mm_add_ps(fiz1,tz);
1066 fjx3 = _mm_add_ps(fjx3,tx);
1067 fjy3 = _mm_add_ps(fjy3,ty);
1068 fjz3 = _mm_add_ps(fjz3,tz);
1070 /**************************
1071 * CALCULATE INTERACTIONS *
1072 **************************/
1074 r21 = _mm_mul_ps(rsq21,rinv21);
1075 r21 = _mm_andnot_ps(dummy_mask,r21);
1077 /* Calculate table index by multiplying r with table scale and truncate to integer */
1078 rt = _mm_mul_ps(r21,vftabscale);
1079 vfitab = _mm_cvttps_epi32(rt);
1080 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1081 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1083 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1084 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1085 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1086 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1087 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1088 _MM_TRANSPOSE4_PS(Y,F,G,H);
1089 Heps = _mm_mul_ps(vfeps,H);
1090 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1091 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1092 velec = _mm_mul_ps(qq21,VV);
1093 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1094 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1096 /* Update potential sum for this i atom from the interaction with this j atom. */
1097 velec = _mm_andnot_ps(dummy_mask,velec);
1098 velecsum = _mm_add_ps(velecsum,velec);
1102 fscal = _mm_andnot_ps(dummy_mask,fscal);
1104 /* Calculate temporary vectorial force */
1105 tx = _mm_mul_ps(fscal,dx21);
1106 ty = _mm_mul_ps(fscal,dy21);
1107 tz = _mm_mul_ps(fscal,dz21);
1109 /* Update vectorial force */
1110 fix2 = _mm_add_ps(fix2,tx);
1111 fiy2 = _mm_add_ps(fiy2,ty);
1112 fiz2 = _mm_add_ps(fiz2,tz);
1114 fjx1 = _mm_add_ps(fjx1,tx);
1115 fjy1 = _mm_add_ps(fjy1,ty);
1116 fjz1 = _mm_add_ps(fjz1,tz);
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 r22 = _mm_mul_ps(rsq22,rinv22);
1123 r22 = _mm_andnot_ps(dummy_mask,r22);
1125 /* Calculate table index by multiplying r with table scale and truncate to integer */
1126 rt = _mm_mul_ps(r22,vftabscale);
1127 vfitab = _mm_cvttps_epi32(rt);
1128 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1129 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1131 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1132 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1133 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1134 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1135 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1136 _MM_TRANSPOSE4_PS(Y,F,G,H);
1137 Heps = _mm_mul_ps(vfeps,H);
1138 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1139 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1140 velec = _mm_mul_ps(qq22,VV);
1141 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1142 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1144 /* Update potential sum for this i atom from the interaction with this j atom. */
1145 velec = _mm_andnot_ps(dummy_mask,velec);
1146 velecsum = _mm_add_ps(velecsum,velec);
1150 fscal = _mm_andnot_ps(dummy_mask,fscal);
1152 /* Calculate temporary vectorial force */
1153 tx = _mm_mul_ps(fscal,dx22);
1154 ty = _mm_mul_ps(fscal,dy22);
1155 tz = _mm_mul_ps(fscal,dz22);
1157 /* Update vectorial force */
1158 fix2 = _mm_add_ps(fix2,tx);
1159 fiy2 = _mm_add_ps(fiy2,ty);
1160 fiz2 = _mm_add_ps(fiz2,tz);
1162 fjx2 = _mm_add_ps(fjx2,tx);
1163 fjy2 = _mm_add_ps(fjy2,ty);
1164 fjz2 = _mm_add_ps(fjz2,tz);
1166 /**************************
1167 * CALCULATE INTERACTIONS *
1168 **************************/
1170 r23 = _mm_mul_ps(rsq23,rinv23);
1171 r23 = _mm_andnot_ps(dummy_mask,r23);
1173 /* Calculate table index by multiplying r with table scale and truncate to integer */
1174 rt = _mm_mul_ps(r23,vftabscale);
1175 vfitab = _mm_cvttps_epi32(rt);
1176 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1177 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1179 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1180 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1181 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1182 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1183 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1184 _MM_TRANSPOSE4_PS(Y,F,G,H);
1185 Heps = _mm_mul_ps(vfeps,H);
1186 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1187 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1188 velec = _mm_mul_ps(qq23,VV);
1189 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1190 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1192 /* Update potential sum for this i atom from the interaction with this j atom. */
1193 velec = _mm_andnot_ps(dummy_mask,velec);
1194 velecsum = _mm_add_ps(velecsum,velec);
1198 fscal = _mm_andnot_ps(dummy_mask,fscal);
1200 /* Calculate temporary vectorial force */
1201 tx = _mm_mul_ps(fscal,dx23);
1202 ty = _mm_mul_ps(fscal,dy23);
1203 tz = _mm_mul_ps(fscal,dz23);
1205 /* Update vectorial force */
1206 fix2 = _mm_add_ps(fix2,tx);
1207 fiy2 = _mm_add_ps(fiy2,ty);
1208 fiz2 = _mm_add_ps(fiz2,tz);
1210 fjx3 = _mm_add_ps(fjx3,tx);
1211 fjy3 = _mm_add_ps(fjy3,ty);
1212 fjz3 = _mm_add_ps(fjz3,tz);
1214 /**************************
1215 * CALCULATE INTERACTIONS *
1216 **************************/
1218 r31 = _mm_mul_ps(rsq31,rinv31);
1219 r31 = _mm_andnot_ps(dummy_mask,r31);
1221 /* Calculate table index by multiplying r with table scale and truncate to integer */
1222 rt = _mm_mul_ps(r31,vftabscale);
1223 vfitab = _mm_cvttps_epi32(rt);
1224 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1225 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1227 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1228 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1229 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1230 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1231 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1232 _MM_TRANSPOSE4_PS(Y,F,G,H);
1233 Heps = _mm_mul_ps(vfeps,H);
1234 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1235 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1236 velec = _mm_mul_ps(qq31,VV);
1237 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1238 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1240 /* Update potential sum for this i atom from the interaction with this j atom. */
1241 velec = _mm_andnot_ps(dummy_mask,velec);
1242 velecsum = _mm_add_ps(velecsum,velec);
1246 fscal = _mm_andnot_ps(dummy_mask,fscal);
1248 /* Calculate temporary vectorial force */
1249 tx = _mm_mul_ps(fscal,dx31);
1250 ty = _mm_mul_ps(fscal,dy31);
1251 tz = _mm_mul_ps(fscal,dz31);
1253 /* Update vectorial force */
1254 fix3 = _mm_add_ps(fix3,tx);
1255 fiy3 = _mm_add_ps(fiy3,ty);
1256 fiz3 = _mm_add_ps(fiz3,tz);
1258 fjx1 = _mm_add_ps(fjx1,tx);
1259 fjy1 = _mm_add_ps(fjy1,ty);
1260 fjz1 = _mm_add_ps(fjz1,tz);
1262 /**************************
1263 * CALCULATE INTERACTIONS *
1264 **************************/
1266 r32 = _mm_mul_ps(rsq32,rinv32);
1267 r32 = _mm_andnot_ps(dummy_mask,r32);
1269 /* Calculate table index by multiplying r with table scale and truncate to integer */
1270 rt = _mm_mul_ps(r32,vftabscale);
1271 vfitab = _mm_cvttps_epi32(rt);
1272 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1273 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1275 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1276 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1277 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1278 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1279 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1280 _MM_TRANSPOSE4_PS(Y,F,G,H);
1281 Heps = _mm_mul_ps(vfeps,H);
1282 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1283 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1284 velec = _mm_mul_ps(qq32,VV);
1285 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1286 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1288 /* Update potential sum for this i atom from the interaction with this j atom. */
1289 velec = _mm_andnot_ps(dummy_mask,velec);
1290 velecsum = _mm_add_ps(velecsum,velec);
1294 fscal = _mm_andnot_ps(dummy_mask,fscal);
1296 /* Calculate temporary vectorial force */
1297 tx = _mm_mul_ps(fscal,dx32);
1298 ty = _mm_mul_ps(fscal,dy32);
1299 tz = _mm_mul_ps(fscal,dz32);
1301 /* Update vectorial force */
1302 fix3 = _mm_add_ps(fix3,tx);
1303 fiy3 = _mm_add_ps(fiy3,ty);
1304 fiz3 = _mm_add_ps(fiz3,tz);
1306 fjx2 = _mm_add_ps(fjx2,tx);
1307 fjy2 = _mm_add_ps(fjy2,ty);
1308 fjz2 = _mm_add_ps(fjz2,tz);
1310 /**************************
1311 * CALCULATE INTERACTIONS *
1312 **************************/
1314 r33 = _mm_mul_ps(rsq33,rinv33);
1315 r33 = _mm_andnot_ps(dummy_mask,r33);
1317 /* Calculate table index by multiplying r with table scale and truncate to integer */
1318 rt = _mm_mul_ps(r33,vftabscale);
1319 vfitab = _mm_cvttps_epi32(rt);
1320 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1321 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1323 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1324 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1325 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1326 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1327 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1328 _MM_TRANSPOSE4_PS(Y,F,G,H);
1329 Heps = _mm_mul_ps(vfeps,H);
1330 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1331 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1332 velec = _mm_mul_ps(qq33,VV);
1333 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1334 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1336 /* Update potential sum for this i atom from the interaction with this j atom. */
1337 velec = _mm_andnot_ps(dummy_mask,velec);
1338 velecsum = _mm_add_ps(velecsum,velec);
1342 fscal = _mm_andnot_ps(dummy_mask,fscal);
1344 /* Calculate temporary vectorial force */
1345 tx = _mm_mul_ps(fscal,dx33);
1346 ty = _mm_mul_ps(fscal,dy33);
1347 tz = _mm_mul_ps(fscal,dz33);
1349 /* Update vectorial force */
1350 fix3 = _mm_add_ps(fix3,tx);
1351 fiy3 = _mm_add_ps(fiy3,ty);
1352 fiz3 = _mm_add_ps(fiz3,tz);
1354 fjx3 = _mm_add_ps(fjx3,tx);
1355 fjy3 = _mm_add_ps(fjy3,ty);
1356 fjz3 = _mm_add_ps(fjz3,tz);
1358 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1359 f+j_coord_offsetC,f+j_coord_offsetD,
1360 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1361 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1363 /* Inner loop uses 456 flops */
1366 /* End of innermost loop */
1368 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1369 f+i_coord_offset,fshift+i_shift_offset);
1372 /* Update potential energies */
1373 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1374 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1376 /* Increment number of inner iterations */
1377 inneriter += j_index_end - j_index_start;
1379 /* Outer loop uses 38 flops */
1382 /* Increment number of outer iterations */
1385 /* Update outer/inner flops */
1387 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*456);
1390 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single
1391 * Electrostatics interaction: CubicSplineTable
1392 * VdW interaction: CubicSplineTable
1393 * Geometry: Water4-Water4
1394 * Calculate force/pot: Force
1397 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_sse2_single
1398 (t_nblist * gmx_restrict nlist,
1399 rvec * gmx_restrict xx,
1400 rvec * gmx_restrict ff,
1401 t_forcerec * gmx_restrict fr,
1402 t_mdatoms * gmx_restrict mdatoms,
1403 nb_kernel_data_t * gmx_restrict kernel_data,
1404 t_nrnb * gmx_restrict nrnb)
1406 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1407 * just 0 for non-waters.
1408 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1409 * jnr indices corresponding to data put in the four positions in the SIMD register.
1411 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1412 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1413 int jnrA,jnrB,jnrC,jnrD;
1414 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1415 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1416 real shX,shY,shZ,rcutoff_scalar;
1417 real *shiftvec,*fshift,*x,*f;
1418 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1420 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1422 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1424 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1426 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1427 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1428 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1429 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1430 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1431 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1432 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1433 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1434 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1435 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1436 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1437 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1438 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1439 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1440 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1441 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1442 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1443 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1444 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1445 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1448 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1451 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1452 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1454 __m128i ifour = _mm_set1_epi32(4);
1455 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1457 __m128 dummy_mask,cutoff_mask;
1458 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1459 __m128 one = _mm_set1_ps(1.0);
1460 __m128 two = _mm_set1_ps(2.0);
1466 jindex = nlist->jindex;
1468 shiftidx = nlist->shift;
1470 shiftvec = fr->shift_vec[0];
1471 fshift = fr->fshift[0];
1472 facel = _mm_set1_ps(fr->epsfac);
1473 charge = mdatoms->chargeA;
1474 nvdwtype = fr->ntype;
1475 vdwparam = fr->nbfp;
1476 vdwtype = mdatoms->typeA;
1478 vftab = kernel_data->table_elec_vdw->data;
1479 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1481 /* Setup water-specific parameters */
1482 inr = nlist->iinr[0];
1483 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1484 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1485 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1486 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1488 jq1 = _mm_set1_ps(charge[inr+1]);
1489 jq2 = _mm_set1_ps(charge[inr+2]);
1490 jq3 = _mm_set1_ps(charge[inr+3]);
1491 vdwjidx0A = 2*vdwtype[inr+0];
1492 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1493 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1494 qq11 = _mm_mul_ps(iq1,jq1);
1495 qq12 = _mm_mul_ps(iq1,jq2);
1496 qq13 = _mm_mul_ps(iq1,jq3);
1497 qq21 = _mm_mul_ps(iq2,jq1);
1498 qq22 = _mm_mul_ps(iq2,jq2);
1499 qq23 = _mm_mul_ps(iq2,jq3);
1500 qq31 = _mm_mul_ps(iq3,jq1);
1501 qq32 = _mm_mul_ps(iq3,jq2);
1502 qq33 = _mm_mul_ps(iq3,jq3);
1504 /* Avoid stupid compiler warnings */
1505 jnrA = jnrB = jnrC = jnrD = 0;
1506 j_coord_offsetA = 0;
1507 j_coord_offsetB = 0;
1508 j_coord_offsetC = 0;
1509 j_coord_offsetD = 0;
1514 /* Start outer loop over neighborlists */
1515 for(iidx=0; iidx<nri; iidx++)
1517 /* Load shift vector for this list */
1518 i_shift_offset = DIM*shiftidx[iidx];
1519 shX = shiftvec[i_shift_offset+XX];
1520 shY = shiftvec[i_shift_offset+YY];
1521 shZ = shiftvec[i_shift_offset+ZZ];
1523 /* Load limits for loop over neighbors */
1524 j_index_start = jindex[iidx];
1525 j_index_end = jindex[iidx+1];
1527 /* Get outer coordinate index */
1529 i_coord_offset = DIM*inr;
1531 /* Load i particle coords and add shift vector */
1532 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1533 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1534 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1535 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1536 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1537 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1538 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1539 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1540 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1541 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1542 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1543 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1545 fix0 = _mm_setzero_ps();
1546 fiy0 = _mm_setzero_ps();
1547 fiz0 = _mm_setzero_ps();
1548 fix1 = _mm_setzero_ps();
1549 fiy1 = _mm_setzero_ps();
1550 fiz1 = _mm_setzero_ps();
1551 fix2 = _mm_setzero_ps();
1552 fiy2 = _mm_setzero_ps();
1553 fiz2 = _mm_setzero_ps();
1554 fix3 = _mm_setzero_ps();
1555 fiy3 = _mm_setzero_ps();
1556 fiz3 = _mm_setzero_ps();
1558 /* Start inner kernel loop */
1559 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1562 /* Get j neighbor index, and coordinate index */
1564 jnrB = jjnr[jidx+1];
1565 jnrC = jjnr[jidx+2];
1566 jnrD = jjnr[jidx+3];
1568 j_coord_offsetA = DIM*jnrA;
1569 j_coord_offsetB = DIM*jnrB;
1570 j_coord_offsetC = DIM*jnrC;
1571 j_coord_offsetD = DIM*jnrD;
1573 /* load j atom coordinates */
1574 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1575 x+j_coord_offsetC,x+j_coord_offsetD,
1576 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1577 &jy2,&jz2,&jx3,&jy3,&jz3);
1579 /* Calculate displacement vector */
1580 dx00 = _mm_sub_ps(ix0,jx0);
1581 dy00 = _mm_sub_ps(iy0,jy0);
1582 dz00 = _mm_sub_ps(iz0,jz0);
1583 dx11 = _mm_sub_ps(ix1,jx1);
1584 dy11 = _mm_sub_ps(iy1,jy1);
1585 dz11 = _mm_sub_ps(iz1,jz1);
1586 dx12 = _mm_sub_ps(ix1,jx2);
1587 dy12 = _mm_sub_ps(iy1,jy2);
1588 dz12 = _mm_sub_ps(iz1,jz2);
1589 dx13 = _mm_sub_ps(ix1,jx3);
1590 dy13 = _mm_sub_ps(iy1,jy3);
1591 dz13 = _mm_sub_ps(iz1,jz3);
1592 dx21 = _mm_sub_ps(ix2,jx1);
1593 dy21 = _mm_sub_ps(iy2,jy1);
1594 dz21 = _mm_sub_ps(iz2,jz1);
1595 dx22 = _mm_sub_ps(ix2,jx2);
1596 dy22 = _mm_sub_ps(iy2,jy2);
1597 dz22 = _mm_sub_ps(iz2,jz2);
1598 dx23 = _mm_sub_ps(ix2,jx3);
1599 dy23 = _mm_sub_ps(iy2,jy3);
1600 dz23 = _mm_sub_ps(iz2,jz3);
1601 dx31 = _mm_sub_ps(ix3,jx1);
1602 dy31 = _mm_sub_ps(iy3,jy1);
1603 dz31 = _mm_sub_ps(iz3,jz1);
1604 dx32 = _mm_sub_ps(ix3,jx2);
1605 dy32 = _mm_sub_ps(iy3,jy2);
1606 dz32 = _mm_sub_ps(iz3,jz2);
1607 dx33 = _mm_sub_ps(ix3,jx3);
1608 dy33 = _mm_sub_ps(iy3,jy3);
1609 dz33 = _mm_sub_ps(iz3,jz3);
1611 /* Calculate squared distance and things based on it */
1612 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1613 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1614 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1615 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1616 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1617 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1618 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1619 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1620 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1621 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1623 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1624 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1625 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1626 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1627 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1628 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1629 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1630 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1631 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1632 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1634 fjx0 = _mm_setzero_ps();
1635 fjy0 = _mm_setzero_ps();
1636 fjz0 = _mm_setzero_ps();
1637 fjx1 = _mm_setzero_ps();
1638 fjy1 = _mm_setzero_ps();
1639 fjz1 = _mm_setzero_ps();
1640 fjx2 = _mm_setzero_ps();
1641 fjy2 = _mm_setzero_ps();
1642 fjz2 = _mm_setzero_ps();
1643 fjx3 = _mm_setzero_ps();
1644 fjy3 = _mm_setzero_ps();
1645 fjz3 = _mm_setzero_ps();
1647 /**************************
1648 * CALCULATE INTERACTIONS *
1649 **************************/
1651 r00 = _mm_mul_ps(rsq00,rinv00);
1653 /* Calculate table index by multiplying r with table scale and truncate to integer */
1654 rt = _mm_mul_ps(r00,vftabscale);
1655 vfitab = _mm_cvttps_epi32(rt);
1656 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1657 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1659 /* CUBIC SPLINE TABLE DISPERSION */
1660 vfitab = _mm_add_epi32(vfitab,ifour);
1661 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1662 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1663 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1664 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1665 _MM_TRANSPOSE4_PS(Y,F,G,H);
1666 Heps = _mm_mul_ps(vfeps,H);
1667 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1668 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1669 fvdw6 = _mm_mul_ps(c6_00,FF);
1671 /* CUBIC SPLINE TABLE REPULSION */
1672 vfitab = _mm_add_epi32(vfitab,ifour);
1673 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1674 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1675 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1676 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1677 _MM_TRANSPOSE4_PS(Y,F,G,H);
1678 Heps = _mm_mul_ps(vfeps,H);
1679 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1680 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1681 fvdw12 = _mm_mul_ps(c12_00,FF);
1682 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1686 /* Calculate temporary vectorial force */
1687 tx = _mm_mul_ps(fscal,dx00);
1688 ty = _mm_mul_ps(fscal,dy00);
1689 tz = _mm_mul_ps(fscal,dz00);
1691 /* Update vectorial force */
1692 fix0 = _mm_add_ps(fix0,tx);
1693 fiy0 = _mm_add_ps(fiy0,ty);
1694 fiz0 = _mm_add_ps(fiz0,tz);
1696 fjx0 = _mm_add_ps(fjx0,tx);
1697 fjy0 = _mm_add_ps(fjy0,ty);
1698 fjz0 = _mm_add_ps(fjz0,tz);
1700 /**************************
1701 * CALCULATE INTERACTIONS *
1702 **************************/
1704 r11 = _mm_mul_ps(rsq11,rinv11);
1706 /* Calculate table index by multiplying r with table scale and truncate to integer */
1707 rt = _mm_mul_ps(r11,vftabscale);
1708 vfitab = _mm_cvttps_epi32(rt);
1709 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1710 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1712 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1713 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1714 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1715 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1716 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1717 _MM_TRANSPOSE4_PS(Y,F,G,H);
1718 Heps = _mm_mul_ps(vfeps,H);
1719 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1720 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1721 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1725 /* Calculate temporary vectorial force */
1726 tx = _mm_mul_ps(fscal,dx11);
1727 ty = _mm_mul_ps(fscal,dy11);
1728 tz = _mm_mul_ps(fscal,dz11);
1730 /* Update vectorial force */
1731 fix1 = _mm_add_ps(fix1,tx);
1732 fiy1 = _mm_add_ps(fiy1,ty);
1733 fiz1 = _mm_add_ps(fiz1,tz);
1735 fjx1 = _mm_add_ps(fjx1,tx);
1736 fjy1 = _mm_add_ps(fjy1,ty);
1737 fjz1 = _mm_add_ps(fjz1,tz);
1739 /**************************
1740 * CALCULATE INTERACTIONS *
1741 **************************/
1743 r12 = _mm_mul_ps(rsq12,rinv12);
1745 /* Calculate table index by multiplying r with table scale and truncate to integer */
1746 rt = _mm_mul_ps(r12,vftabscale);
1747 vfitab = _mm_cvttps_epi32(rt);
1748 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1749 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1751 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1752 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1753 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1754 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1755 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1756 _MM_TRANSPOSE4_PS(Y,F,G,H);
1757 Heps = _mm_mul_ps(vfeps,H);
1758 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1759 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1760 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1764 /* Calculate temporary vectorial force */
1765 tx = _mm_mul_ps(fscal,dx12);
1766 ty = _mm_mul_ps(fscal,dy12);
1767 tz = _mm_mul_ps(fscal,dz12);
1769 /* Update vectorial force */
1770 fix1 = _mm_add_ps(fix1,tx);
1771 fiy1 = _mm_add_ps(fiy1,ty);
1772 fiz1 = _mm_add_ps(fiz1,tz);
1774 fjx2 = _mm_add_ps(fjx2,tx);
1775 fjy2 = _mm_add_ps(fjy2,ty);
1776 fjz2 = _mm_add_ps(fjz2,tz);
1778 /**************************
1779 * CALCULATE INTERACTIONS *
1780 **************************/
1782 r13 = _mm_mul_ps(rsq13,rinv13);
1784 /* Calculate table index by multiplying r with table scale and truncate to integer */
1785 rt = _mm_mul_ps(r13,vftabscale);
1786 vfitab = _mm_cvttps_epi32(rt);
1787 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1788 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1790 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1791 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1792 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1793 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1794 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1795 _MM_TRANSPOSE4_PS(Y,F,G,H);
1796 Heps = _mm_mul_ps(vfeps,H);
1797 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1798 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1799 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1803 /* Calculate temporary vectorial force */
1804 tx = _mm_mul_ps(fscal,dx13);
1805 ty = _mm_mul_ps(fscal,dy13);
1806 tz = _mm_mul_ps(fscal,dz13);
1808 /* Update vectorial force */
1809 fix1 = _mm_add_ps(fix1,tx);
1810 fiy1 = _mm_add_ps(fiy1,ty);
1811 fiz1 = _mm_add_ps(fiz1,tz);
1813 fjx3 = _mm_add_ps(fjx3,tx);
1814 fjy3 = _mm_add_ps(fjy3,ty);
1815 fjz3 = _mm_add_ps(fjz3,tz);
1817 /**************************
1818 * CALCULATE INTERACTIONS *
1819 **************************/
1821 r21 = _mm_mul_ps(rsq21,rinv21);
1823 /* Calculate table index by multiplying r with table scale and truncate to integer */
1824 rt = _mm_mul_ps(r21,vftabscale);
1825 vfitab = _mm_cvttps_epi32(rt);
1826 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1827 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1829 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1830 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1831 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1832 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1833 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1834 _MM_TRANSPOSE4_PS(Y,F,G,H);
1835 Heps = _mm_mul_ps(vfeps,H);
1836 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1837 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1838 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1842 /* Calculate temporary vectorial force */
1843 tx = _mm_mul_ps(fscal,dx21);
1844 ty = _mm_mul_ps(fscal,dy21);
1845 tz = _mm_mul_ps(fscal,dz21);
1847 /* Update vectorial force */
1848 fix2 = _mm_add_ps(fix2,tx);
1849 fiy2 = _mm_add_ps(fiy2,ty);
1850 fiz2 = _mm_add_ps(fiz2,tz);
1852 fjx1 = _mm_add_ps(fjx1,tx);
1853 fjy1 = _mm_add_ps(fjy1,ty);
1854 fjz1 = _mm_add_ps(fjz1,tz);
1856 /**************************
1857 * CALCULATE INTERACTIONS *
1858 **************************/
1860 r22 = _mm_mul_ps(rsq22,rinv22);
1862 /* Calculate table index by multiplying r with table scale and truncate to integer */
1863 rt = _mm_mul_ps(r22,vftabscale);
1864 vfitab = _mm_cvttps_epi32(rt);
1865 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1866 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1868 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1869 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1870 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1871 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1872 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1873 _MM_TRANSPOSE4_PS(Y,F,G,H);
1874 Heps = _mm_mul_ps(vfeps,H);
1875 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1876 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1877 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1881 /* Calculate temporary vectorial force */
1882 tx = _mm_mul_ps(fscal,dx22);
1883 ty = _mm_mul_ps(fscal,dy22);
1884 tz = _mm_mul_ps(fscal,dz22);
1886 /* Update vectorial force */
1887 fix2 = _mm_add_ps(fix2,tx);
1888 fiy2 = _mm_add_ps(fiy2,ty);
1889 fiz2 = _mm_add_ps(fiz2,tz);
1891 fjx2 = _mm_add_ps(fjx2,tx);
1892 fjy2 = _mm_add_ps(fjy2,ty);
1893 fjz2 = _mm_add_ps(fjz2,tz);
1895 /**************************
1896 * CALCULATE INTERACTIONS *
1897 **************************/
1899 r23 = _mm_mul_ps(rsq23,rinv23);
1901 /* Calculate table index by multiplying r with table scale and truncate to integer */
1902 rt = _mm_mul_ps(r23,vftabscale);
1903 vfitab = _mm_cvttps_epi32(rt);
1904 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1905 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1907 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1908 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1909 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1910 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1911 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1912 _MM_TRANSPOSE4_PS(Y,F,G,H);
1913 Heps = _mm_mul_ps(vfeps,H);
1914 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1915 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1916 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1920 /* Calculate temporary vectorial force */
1921 tx = _mm_mul_ps(fscal,dx23);
1922 ty = _mm_mul_ps(fscal,dy23);
1923 tz = _mm_mul_ps(fscal,dz23);
1925 /* Update vectorial force */
1926 fix2 = _mm_add_ps(fix2,tx);
1927 fiy2 = _mm_add_ps(fiy2,ty);
1928 fiz2 = _mm_add_ps(fiz2,tz);
1930 fjx3 = _mm_add_ps(fjx3,tx);
1931 fjy3 = _mm_add_ps(fjy3,ty);
1932 fjz3 = _mm_add_ps(fjz3,tz);
1934 /**************************
1935 * CALCULATE INTERACTIONS *
1936 **************************/
1938 r31 = _mm_mul_ps(rsq31,rinv31);
1940 /* Calculate table index by multiplying r with table scale and truncate to integer */
1941 rt = _mm_mul_ps(r31,vftabscale);
1942 vfitab = _mm_cvttps_epi32(rt);
1943 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1944 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1946 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1947 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1948 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1949 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1950 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1951 _MM_TRANSPOSE4_PS(Y,F,G,H);
1952 Heps = _mm_mul_ps(vfeps,H);
1953 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1954 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1955 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1959 /* Calculate temporary vectorial force */
1960 tx = _mm_mul_ps(fscal,dx31);
1961 ty = _mm_mul_ps(fscal,dy31);
1962 tz = _mm_mul_ps(fscal,dz31);
1964 /* Update vectorial force */
1965 fix3 = _mm_add_ps(fix3,tx);
1966 fiy3 = _mm_add_ps(fiy3,ty);
1967 fiz3 = _mm_add_ps(fiz3,tz);
1969 fjx1 = _mm_add_ps(fjx1,tx);
1970 fjy1 = _mm_add_ps(fjy1,ty);
1971 fjz1 = _mm_add_ps(fjz1,tz);
1973 /**************************
1974 * CALCULATE INTERACTIONS *
1975 **************************/
1977 r32 = _mm_mul_ps(rsq32,rinv32);
1979 /* Calculate table index by multiplying r with table scale and truncate to integer */
1980 rt = _mm_mul_ps(r32,vftabscale);
1981 vfitab = _mm_cvttps_epi32(rt);
1982 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1983 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1985 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1986 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1987 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1988 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1989 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1990 _MM_TRANSPOSE4_PS(Y,F,G,H);
1991 Heps = _mm_mul_ps(vfeps,H);
1992 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1993 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1994 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1998 /* Calculate temporary vectorial force */
1999 tx = _mm_mul_ps(fscal,dx32);
2000 ty = _mm_mul_ps(fscal,dy32);
2001 tz = _mm_mul_ps(fscal,dz32);
2003 /* Update vectorial force */
2004 fix3 = _mm_add_ps(fix3,tx);
2005 fiy3 = _mm_add_ps(fiy3,ty);
2006 fiz3 = _mm_add_ps(fiz3,tz);
2008 fjx2 = _mm_add_ps(fjx2,tx);
2009 fjy2 = _mm_add_ps(fjy2,ty);
2010 fjz2 = _mm_add_ps(fjz2,tz);
2012 /**************************
2013 * CALCULATE INTERACTIONS *
2014 **************************/
2016 r33 = _mm_mul_ps(rsq33,rinv33);
2018 /* Calculate table index by multiplying r with table scale and truncate to integer */
2019 rt = _mm_mul_ps(r33,vftabscale);
2020 vfitab = _mm_cvttps_epi32(rt);
2021 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2022 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2024 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2025 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2026 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2027 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2028 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2029 _MM_TRANSPOSE4_PS(Y,F,G,H);
2030 Heps = _mm_mul_ps(vfeps,H);
2031 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2032 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2033 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2037 /* Calculate temporary vectorial force */
2038 tx = _mm_mul_ps(fscal,dx33);
2039 ty = _mm_mul_ps(fscal,dy33);
2040 tz = _mm_mul_ps(fscal,dz33);
2042 /* Update vectorial force */
2043 fix3 = _mm_add_ps(fix3,tx);
2044 fiy3 = _mm_add_ps(fiy3,ty);
2045 fiz3 = _mm_add_ps(fiz3,tz);
2047 fjx3 = _mm_add_ps(fjx3,tx);
2048 fjy3 = _mm_add_ps(fjy3,ty);
2049 fjz3 = _mm_add_ps(fjz3,tz);
2051 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2052 f+j_coord_offsetC,f+j_coord_offsetD,
2053 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2054 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2056 /* Inner loop uses 402 flops */
2059 if(jidx<j_index_end)
2062 /* Get j neighbor index, and coordinate index */
2064 jnrB = jjnr[jidx+1];
2065 jnrC = jjnr[jidx+2];
2066 jnrD = jjnr[jidx+3];
2068 /* Sign of each element will be negative for non-real atoms.
2069 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2070 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2072 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2073 jnrA = (jnrA>=0) ? jnrA : 0;
2074 jnrB = (jnrB>=0) ? jnrB : 0;
2075 jnrC = (jnrC>=0) ? jnrC : 0;
2076 jnrD = (jnrD>=0) ? jnrD : 0;
2078 j_coord_offsetA = DIM*jnrA;
2079 j_coord_offsetB = DIM*jnrB;
2080 j_coord_offsetC = DIM*jnrC;
2081 j_coord_offsetD = DIM*jnrD;
2083 /* load j atom coordinates */
2084 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2085 x+j_coord_offsetC,x+j_coord_offsetD,
2086 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2087 &jy2,&jz2,&jx3,&jy3,&jz3);
2089 /* Calculate displacement vector */
2090 dx00 = _mm_sub_ps(ix0,jx0);
2091 dy00 = _mm_sub_ps(iy0,jy0);
2092 dz00 = _mm_sub_ps(iz0,jz0);
2093 dx11 = _mm_sub_ps(ix1,jx1);
2094 dy11 = _mm_sub_ps(iy1,jy1);
2095 dz11 = _mm_sub_ps(iz1,jz1);
2096 dx12 = _mm_sub_ps(ix1,jx2);
2097 dy12 = _mm_sub_ps(iy1,jy2);
2098 dz12 = _mm_sub_ps(iz1,jz2);
2099 dx13 = _mm_sub_ps(ix1,jx3);
2100 dy13 = _mm_sub_ps(iy1,jy3);
2101 dz13 = _mm_sub_ps(iz1,jz3);
2102 dx21 = _mm_sub_ps(ix2,jx1);
2103 dy21 = _mm_sub_ps(iy2,jy1);
2104 dz21 = _mm_sub_ps(iz2,jz1);
2105 dx22 = _mm_sub_ps(ix2,jx2);
2106 dy22 = _mm_sub_ps(iy2,jy2);
2107 dz22 = _mm_sub_ps(iz2,jz2);
2108 dx23 = _mm_sub_ps(ix2,jx3);
2109 dy23 = _mm_sub_ps(iy2,jy3);
2110 dz23 = _mm_sub_ps(iz2,jz3);
2111 dx31 = _mm_sub_ps(ix3,jx1);
2112 dy31 = _mm_sub_ps(iy3,jy1);
2113 dz31 = _mm_sub_ps(iz3,jz1);
2114 dx32 = _mm_sub_ps(ix3,jx2);
2115 dy32 = _mm_sub_ps(iy3,jy2);
2116 dz32 = _mm_sub_ps(iz3,jz2);
2117 dx33 = _mm_sub_ps(ix3,jx3);
2118 dy33 = _mm_sub_ps(iy3,jy3);
2119 dz33 = _mm_sub_ps(iz3,jz3);
2121 /* Calculate squared distance and things based on it */
2122 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2123 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2124 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2125 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2126 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2127 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2128 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2129 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2130 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2131 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2133 rinv00 = gmx_mm_invsqrt_ps(rsq00);
2134 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2135 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2136 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2137 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2138 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2139 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2140 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2141 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2142 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2144 fjx0 = _mm_setzero_ps();
2145 fjy0 = _mm_setzero_ps();
2146 fjz0 = _mm_setzero_ps();
2147 fjx1 = _mm_setzero_ps();
2148 fjy1 = _mm_setzero_ps();
2149 fjz1 = _mm_setzero_ps();
2150 fjx2 = _mm_setzero_ps();
2151 fjy2 = _mm_setzero_ps();
2152 fjz2 = _mm_setzero_ps();
2153 fjx3 = _mm_setzero_ps();
2154 fjy3 = _mm_setzero_ps();
2155 fjz3 = _mm_setzero_ps();
2157 /**************************
2158 * CALCULATE INTERACTIONS *
2159 **************************/
2161 r00 = _mm_mul_ps(rsq00,rinv00);
2162 r00 = _mm_andnot_ps(dummy_mask,r00);
2164 /* Calculate table index by multiplying r with table scale and truncate to integer */
2165 rt = _mm_mul_ps(r00,vftabscale);
2166 vfitab = _mm_cvttps_epi32(rt);
2167 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2168 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2170 /* CUBIC SPLINE TABLE DISPERSION */
2171 vfitab = _mm_add_epi32(vfitab,ifour);
2172 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2173 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2174 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2175 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2176 _MM_TRANSPOSE4_PS(Y,F,G,H);
2177 Heps = _mm_mul_ps(vfeps,H);
2178 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2179 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2180 fvdw6 = _mm_mul_ps(c6_00,FF);
2182 /* CUBIC SPLINE TABLE REPULSION */
2183 vfitab = _mm_add_epi32(vfitab,ifour);
2184 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2185 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2186 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2187 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2188 _MM_TRANSPOSE4_PS(Y,F,G,H);
2189 Heps = _mm_mul_ps(vfeps,H);
2190 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2191 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2192 fvdw12 = _mm_mul_ps(c12_00,FF);
2193 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2197 fscal = _mm_andnot_ps(dummy_mask,fscal);
2199 /* Calculate temporary vectorial force */
2200 tx = _mm_mul_ps(fscal,dx00);
2201 ty = _mm_mul_ps(fscal,dy00);
2202 tz = _mm_mul_ps(fscal,dz00);
2204 /* Update vectorial force */
2205 fix0 = _mm_add_ps(fix0,tx);
2206 fiy0 = _mm_add_ps(fiy0,ty);
2207 fiz0 = _mm_add_ps(fiz0,tz);
2209 fjx0 = _mm_add_ps(fjx0,tx);
2210 fjy0 = _mm_add_ps(fjy0,ty);
2211 fjz0 = _mm_add_ps(fjz0,tz);
2213 /**************************
2214 * CALCULATE INTERACTIONS *
2215 **************************/
2217 r11 = _mm_mul_ps(rsq11,rinv11);
2218 r11 = _mm_andnot_ps(dummy_mask,r11);
2220 /* Calculate table index by multiplying r with table scale and truncate to integer */
2221 rt = _mm_mul_ps(r11,vftabscale);
2222 vfitab = _mm_cvttps_epi32(rt);
2223 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2224 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2226 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2227 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2228 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2229 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2230 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2231 _MM_TRANSPOSE4_PS(Y,F,G,H);
2232 Heps = _mm_mul_ps(vfeps,H);
2233 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2234 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2235 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2239 fscal = _mm_andnot_ps(dummy_mask,fscal);
2241 /* Calculate temporary vectorial force */
2242 tx = _mm_mul_ps(fscal,dx11);
2243 ty = _mm_mul_ps(fscal,dy11);
2244 tz = _mm_mul_ps(fscal,dz11);
2246 /* Update vectorial force */
2247 fix1 = _mm_add_ps(fix1,tx);
2248 fiy1 = _mm_add_ps(fiy1,ty);
2249 fiz1 = _mm_add_ps(fiz1,tz);
2251 fjx1 = _mm_add_ps(fjx1,tx);
2252 fjy1 = _mm_add_ps(fjy1,ty);
2253 fjz1 = _mm_add_ps(fjz1,tz);
2255 /**************************
2256 * CALCULATE INTERACTIONS *
2257 **************************/
2259 r12 = _mm_mul_ps(rsq12,rinv12);
2260 r12 = _mm_andnot_ps(dummy_mask,r12);
2262 /* Calculate table index by multiplying r with table scale and truncate to integer */
2263 rt = _mm_mul_ps(r12,vftabscale);
2264 vfitab = _mm_cvttps_epi32(rt);
2265 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2266 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2268 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2269 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2270 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2271 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2272 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2273 _MM_TRANSPOSE4_PS(Y,F,G,H);
2274 Heps = _mm_mul_ps(vfeps,H);
2275 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2276 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2277 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2281 fscal = _mm_andnot_ps(dummy_mask,fscal);
2283 /* Calculate temporary vectorial force */
2284 tx = _mm_mul_ps(fscal,dx12);
2285 ty = _mm_mul_ps(fscal,dy12);
2286 tz = _mm_mul_ps(fscal,dz12);
2288 /* Update vectorial force */
2289 fix1 = _mm_add_ps(fix1,tx);
2290 fiy1 = _mm_add_ps(fiy1,ty);
2291 fiz1 = _mm_add_ps(fiz1,tz);
2293 fjx2 = _mm_add_ps(fjx2,tx);
2294 fjy2 = _mm_add_ps(fjy2,ty);
2295 fjz2 = _mm_add_ps(fjz2,tz);
2297 /**************************
2298 * CALCULATE INTERACTIONS *
2299 **************************/
2301 r13 = _mm_mul_ps(rsq13,rinv13);
2302 r13 = _mm_andnot_ps(dummy_mask,r13);
2304 /* Calculate table index by multiplying r with table scale and truncate to integer */
2305 rt = _mm_mul_ps(r13,vftabscale);
2306 vfitab = _mm_cvttps_epi32(rt);
2307 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2308 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2310 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2311 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2312 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2313 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2314 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2315 _MM_TRANSPOSE4_PS(Y,F,G,H);
2316 Heps = _mm_mul_ps(vfeps,H);
2317 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2318 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2319 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2323 fscal = _mm_andnot_ps(dummy_mask,fscal);
2325 /* Calculate temporary vectorial force */
2326 tx = _mm_mul_ps(fscal,dx13);
2327 ty = _mm_mul_ps(fscal,dy13);
2328 tz = _mm_mul_ps(fscal,dz13);
2330 /* Update vectorial force */
2331 fix1 = _mm_add_ps(fix1,tx);
2332 fiy1 = _mm_add_ps(fiy1,ty);
2333 fiz1 = _mm_add_ps(fiz1,tz);
2335 fjx3 = _mm_add_ps(fjx3,tx);
2336 fjy3 = _mm_add_ps(fjy3,ty);
2337 fjz3 = _mm_add_ps(fjz3,tz);
2339 /**************************
2340 * CALCULATE INTERACTIONS *
2341 **************************/
2343 r21 = _mm_mul_ps(rsq21,rinv21);
2344 r21 = _mm_andnot_ps(dummy_mask,r21);
2346 /* Calculate table index by multiplying r with table scale and truncate to integer */
2347 rt = _mm_mul_ps(r21,vftabscale);
2348 vfitab = _mm_cvttps_epi32(rt);
2349 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2350 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2352 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2353 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2354 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2355 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2356 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2357 _MM_TRANSPOSE4_PS(Y,F,G,H);
2358 Heps = _mm_mul_ps(vfeps,H);
2359 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2360 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2361 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2365 fscal = _mm_andnot_ps(dummy_mask,fscal);
2367 /* Calculate temporary vectorial force */
2368 tx = _mm_mul_ps(fscal,dx21);
2369 ty = _mm_mul_ps(fscal,dy21);
2370 tz = _mm_mul_ps(fscal,dz21);
2372 /* Update vectorial force */
2373 fix2 = _mm_add_ps(fix2,tx);
2374 fiy2 = _mm_add_ps(fiy2,ty);
2375 fiz2 = _mm_add_ps(fiz2,tz);
2377 fjx1 = _mm_add_ps(fjx1,tx);
2378 fjy1 = _mm_add_ps(fjy1,ty);
2379 fjz1 = _mm_add_ps(fjz1,tz);
2381 /**************************
2382 * CALCULATE INTERACTIONS *
2383 **************************/
2385 r22 = _mm_mul_ps(rsq22,rinv22);
2386 r22 = _mm_andnot_ps(dummy_mask,r22);
2388 /* Calculate table index by multiplying r with table scale and truncate to integer */
2389 rt = _mm_mul_ps(r22,vftabscale);
2390 vfitab = _mm_cvttps_epi32(rt);
2391 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2392 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2394 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2395 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2396 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2397 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2398 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2399 _MM_TRANSPOSE4_PS(Y,F,G,H);
2400 Heps = _mm_mul_ps(vfeps,H);
2401 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2402 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2403 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2407 fscal = _mm_andnot_ps(dummy_mask,fscal);
2409 /* Calculate temporary vectorial force */
2410 tx = _mm_mul_ps(fscal,dx22);
2411 ty = _mm_mul_ps(fscal,dy22);
2412 tz = _mm_mul_ps(fscal,dz22);
2414 /* Update vectorial force */
2415 fix2 = _mm_add_ps(fix2,tx);
2416 fiy2 = _mm_add_ps(fiy2,ty);
2417 fiz2 = _mm_add_ps(fiz2,tz);
2419 fjx2 = _mm_add_ps(fjx2,tx);
2420 fjy2 = _mm_add_ps(fjy2,ty);
2421 fjz2 = _mm_add_ps(fjz2,tz);
2423 /**************************
2424 * CALCULATE INTERACTIONS *
2425 **************************/
2427 r23 = _mm_mul_ps(rsq23,rinv23);
2428 r23 = _mm_andnot_ps(dummy_mask,r23);
2430 /* Calculate table index by multiplying r with table scale and truncate to integer */
2431 rt = _mm_mul_ps(r23,vftabscale);
2432 vfitab = _mm_cvttps_epi32(rt);
2433 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2434 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2436 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2437 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2438 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2439 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2440 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2441 _MM_TRANSPOSE4_PS(Y,F,G,H);
2442 Heps = _mm_mul_ps(vfeps,H);
2443 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2444 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2445 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2449 fscal = _mm_andnot_ps(dummy_mask,fscal);
2451 /* Calculate temporary vectorial force */
2452 tx = _mm_mul_ps(fscal,dx23);
2453 ty = _mm_mul_ps(fscal,dy23);
2454 tz = _mm_mul_ps(fscal,dz23);
2456 /* Update vectorial force */
2457 fix2 = _mm_add_ps(fix2,tx);
2458 fiy2 = _mm_add_ps(fiy2,ty);
2459 fiz2 = _mm_add_ps(fiz2,tz);
2461 fjx3 = _mm_add_ps(fjx3,tx);
2462 fjy3 = _mm_add_ps(fjy3,ty);
2463 fjz3 = _mm_add_ps(fjz3,tz);
2465 /**************************
2466 * CALCULATE INTERACTIONS *
2467 **************************/
2469 r31 = _mm_mul_ps(rsq31,rinv31);
2470 r31 = _mm_andnot_ps(dummy_mask,r31);
2472 /* Calculate table index by multiplying r with table scale and truncate to integer */
2473 rt = _mm_mul_ps(r31,vftabscale);
2474 vfitab = _mm_cvttps_epi32(rt);
2475 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2476 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2478 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2479 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2480 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2481 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2482 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2483 _MM_TRANSPOSE4_PS(Y,F,G,H);
2484 Heps = _mm_mul_ps(vfeps,H);
2485 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2486 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2487 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2491 fscal = _mm_andnot_ps(dummy_mask,fscal);
2493 /* Calculate temporary vectorial force */
2494 tx = _mm_mul_ps(fscal,dx31);
2495 ty = _mm_mul_ps(fscal,dy31);
2496 tz = _mm_mul_ps(fscal,dz31);
2498 /* Update vectorial force */
2499 fix3 = _mm_add_ps(fix3,tx);
2500 fiy3 = _mm_add_ps(fiy3,ty);
2501 fiz3 = _mm_add_ps(fiz3,tz);
2503 fjx1 = _mm_add_ps(fjx1,tx);
2504 fjy1 = _mm_add_ps(fjy1,ty);
2505 fjz1 = _mm_add_ps(fjz1,tz);
2507 /**************************
2508 * CALCULATE INTERACTIONS *
2509 **************************/
2511 r32 = _mm_mul_ps(rsq32,rinv32);
2512 r32 = _mm_andnot_ps(dummy_mask,r32);
2514 /* Calculate table index by multiplying r with table scale and truncate to integer */
2515 rt = _mm_mul_ps(r32,vftabscale);
2516 vfitab = _mm_cvttps_epi32(rt);
2517 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2518 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2520 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2521 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2522 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2523 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2524 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2525 _MM_TRANSPOSE4_PS(Y,F,G,H);
2526 Heps = _mm_mul_ps(vfeps,H);
2527 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2528 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2529 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2533 fscal = _mm_andnot_ps(dummy_mask,fscal);
2535 /* Calculate temporary vectorial force */
2536 tx = _mm_mul_ps(fscal,dx32);
2537 ty = _mm_mul_ps(fscal,dy32);
2538 tz = _mm_mul_ps(fscal,dz32);
2540 /* Update vectorial force */
2541 fix3 = _mm_add_ps(fix3,tx);
2542 fiy3 = _mm_add_ps(fiy3,ty);
2543 fiz3 = _mm_add_ps(fiz3,tz);
2545 fjx2 = _mm_add_ps(fjx2,tx);
2546 fjy2 = _mm_add_ps(fjy2,ty);
2547 fjz2 = _mm_add_ps(fjz2,tz);
2549 /**************************
2550 * CALCULATE INTERACTIONS *
2551 **************************/
2553 r33 = _mm_mul_ps(rsq33,rinv33);
2554 r33 = _mm_andnot_ps(dummy_mask,r33);
2556 /* Calculate table index by multiplying r with table scale and truncate to integer */
2557 rt = _mm_mul_ps(r33,vftabscale);
2558 vfitab = _mm_cvttps_epi32(rt);
2559 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2560 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2562 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2563 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2564 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2565 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2566 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2567 _MM_TRANSPOSE4_PS(Y,F,G,H);
2568 Heps = _mm_mul_ps(vfeps,H);
2569 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2570 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2571 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2575 fscal = _mm_andnot_ps(dummy_mask,fscal);
2577 /* Calculate temporary vectorial force */
2578 tx = _mm_mul_ps(fscal,dx33);
2579 ty = _mm_mul_ps(fscal,dy33);
2580 tz = _mm_mul_ps(fscal,dz33);
2582 /* Update vectorial force */
2583 fix3 = _mm_add_ps(fix3,tx);
2584 fiy3 = _mm_add_ps(fiy3,ty);
2585 fiz3 = _mm_add_ps(fiz3,tz);
2587 fjx3 = _mm_add_ps(fjx3,tx);
2588 fjy3 = _mm_add_ps(fjy3,ty);
2589 fjz3 = _mm_add_ps(fjz3,tz);
2591 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2592 f+j_coord_offsetC,f+j_coord_offsetD,
2593 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2594 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2596 /* Inner loop uses 412 flops */
2599 /* End of innermost loop */
2601 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2602 f+i_coord_offset,fshift+i_shift_offset);
2604 /* Increment number of inner iterations */
2605 inneriter += j_index_end - j_index_start;
2607 /* Outer loop uses 36 flops */
2610 /* Increment number of outer iterations */
2613 /* Update outer/inner flops */
2615 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*412);