2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
73 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
75 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
77 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
90 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
94 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
96 __m128i ifour = _mm_set1_epi32(4);
97 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
99 __m128 dummy_mask,cutoff_mask;
100 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
101 __m128 one = _mm_set1_ps(1.0);
102 __m128 two = _mm_set1_ps(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm_set1_ps(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_elec_vdw->data;
121 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
126 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
127 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
128 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
130 jq0 = _mm_set1_ps(charge[inr+0]);
131 jq1 = _mm_set1_ps(charge[inr+1]);
132 jq2 = _mm_set1_ps(charge[inr+2]);
133 vdwjidx0A = 2*vdwtype[inr+0];
134 qq00 = _mm_mul_ps(iq0,jq0);
135 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
136 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
137 qq01 = _mm_mul_ps(iq0,jq1);
138 qq02 = _mm_mul_ps(iq0,jq2);
139 qq10 = _mm_mul_ps(iq1,jq0);
140 qq11 = _mm_mul_ps(iq1,jq1);
141 qq12 = _mm_mul_ps(iq1,jq2);
142 qq20 = _mm_mul_ps(iq2,jq0);
143 qq21 = _mm_mul_ps(iq2,jq1);
144 qq22 = _mm_mul_ps(iq2,jq2);
146 /* Avoid stupid compiler warnings */
147 jnrA = jnrB = jnrC = jnrD = 0;
156 /* Start outer loop over neighborlists */
157 for(iidx=0; iidx<nri; iidx++)
159 /* Load shift vector for this list */
160 i_shift_offset = DIM*shiftidx[iidx];
161 shX = shiftvec[i_shift_offset+XX];
162 shY = shiftvec[i_shift_offset+YY];
163 shZ = shiftvec[i_shift_offset+ZZ];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
175 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
176 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
177 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
178 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
179 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
180 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
181 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
182 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
184 fix0 = _mm_setzero_ps();
185 fiy0 = _mm_setzero_ps();
186 fiz0 = _mm_setzero_ps();
187 fix1 = _mm_setzero_ps();
188 fiy1 = _mm_setzero_ps();
189 fiz1 = _mm_setzero_ps();
190 fix2 = _mm_setzero_ps();
191 fiy2 = _mm_setzero_ps();
192 fiz2 = _mm_setzero_ps();
194 /* Reset potential sums */
195 velecsum = _mm_setzero_ps();
196 vvdwsum = _mm_setzero_ps();
198 /* Start inner kernel loop */
199 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
202 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA = DIM*jnrA;
209 j_coord_offsetB = DIM*jnrB;
210 j_coord_offsetC = DIM*jnrC;
211 j_coord_offsetD = DIM*jnrD;
213 /* load j atom coordinates */
214 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
215 x+j_coord_offsetC,x+j_coord_offsetD,
216 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
218 /* Calculate displacement vector */
219 dx00 = _mm_sub_ps(ix0,jx0);
220 dy00 = _mm_sub_ps(iy0,jy0);
221 dz00 = _mm_sub_ps(iz0,jz0);
222 dx01 = _mm_sub_ps(ix0,jx1);
223 dy01 = _mm_sub_ps(iy0,jy1);
224 dz01 = _mm_sub_ps(iz0,jz1);
225 dx02 = _mm_sub_ps(ix0,jx2);
226 dy02 = _mm_sub_ps(iy0,jy2);
227 dz02 = _mm_sub_ps(iz0,jz2);
228 dx10 = _mm_sub_ps(ix1,jx0);
229 dy10 = _mm_sub_ps(iy1,jy0);
230 dz10 = _mm_sub_ps(iz1,jz0);
231 dx11 = _mm_sub_ps(ix1,jx1);
232 dy11 = _mm_sub_ps(iy1,jy1);
233 dz11 = _mm_sub_ps(iz1,jz1);
234 dx12 = _mm_sub_ps(ix1,jx2);
235 dy12 = _mm_sub_ps(iy1,jy2);
236 dz12 = _mm_sub_ps(iz1,jz2);
237 dx20 = _mm_sub_ps(ix2,jx0);
238 dy20 = _mm_sub_ps(iy2,jy0);
239 dz20 = _mm_sub_ps(iz2,jz0);
240 dx21 = _mm_sub_ps(ix2,jx1);
241 dy21 = _mm_sub_ps(iy2,jy1);
242 dz21 = _mm_sub_ps(iz2,jz1);
243 dx22 = _mm_sub_ps(ix2,jx2);
244 dy22 = _mm_sub_ps(iy2,jy2);
245 dz22 = _mm_sub_ps(iz2,jz2);
247 /* Calculate squared distance and things based on it */
248 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
249 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
250 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
251 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
252 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
253 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
254 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
255 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
256 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
258 rinv00 = gmx_mm_invsqrt_ps(rsq00);
259 rinv01 = gmx_mm_invsqrt_ps(rsq01);
260 rinv02 = gmx_mm_invsqrt_ps(rsq02);
261 rinv10 = gmx_mm_invsqrt_ps(rsq10);
262 rinv11 = gmx_mm_invsqrt_ps(rsq11);
263 rinv12 = gmx_mm_invsqrt_ps(rsq12);
264 rinv20 = gmx_mm_invsqrt_ps(rsq20);
265 rinv21 = gmx_mm_invsqrt_ps(rsq21);
266 rinv22 = gmx_mm_invsqrt_ps(rsq22);
268 fjx0 = _mm_setzero_ps();
269 fjy0 = _mm_setzero_ps();
270 fjz0 = _mm_setzero_ps();
271 fjx1 = _mm_setzero_ps();
272 fjy1 = _mm_setzero_ps();
273 fjz1 = _mm_setzero_ps();
274 fjx2 = _mm_setzero_ps();
275 fjy2 = _mm_setzero_ps();
276 fjz2 = _mm_setzero_ps();
278 /**************************
279 * CALCULATE INTERACTIONS *
280 **************************/
282 r00 = _mm_mul_ps(rsq00,rinv00);
284 /* Calculate table index by multiplying r with table scale and truncate to integer */
285 rt = _mm_mul_ps(r00,vftabscale);
286 vfitab = _mm_cvttps_epi32(rt);
287 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
288 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
290 /* CUBIC SPLINE TABLE ELECTROSTATICS */
291 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
292 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
293 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
294 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
295 _MM_TRANSPOSE4_PS(Y,F,G,H);
296 Heps = _mm_mul_ps(vfeps,H);
297 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
298 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
299 velec = _mm_mul_ps(qq00,VV);
300 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
301 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
303 /* CUBIC SPLINE TABLE DISPERSION */
304 vfitab = _mm_add_epi32(vfitab,ifour);
305 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
306 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
307 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
308 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
309 _MM_TRANSPOSE4_PS(Y,F,G,H);
310 Heps = _mm_mul_ps(vfeps,H);
311 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
312 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
313 vvdw6 = _mm_mul_ps(c6_00,VV);
314 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
315 fvdw6 = _mm_mul_ps(c6_00,FF);
317 /* CUBIC SPLINE TABLE REPULSION */
318 vfitab = _mm_add_epi32(vfitab,ifour);
319 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
320 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
321 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
322 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
323 _MM_TRANSPOSE4_PS(Y,F,G,H);
324 Heps = _mm_mul_ps(vfeps,H);
325 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
326 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
327 vvdw12 = _mm_mul_ps(c12_00,VV);
328 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
329 fvdw12 = _mm_mul_ps(c12_00,FF);
330 vvdw = _mm_add_ps(vvdw12,vvdw6);
331 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
333 /* Update potential sum for this i atom from the interaction with this j atom. */
334 velecsum = _mm_add_ps(velecsum,velec);
335 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
337 fscal = _mm_add_ps(felec,fvdw);
339 /* Calculate temporary vectorial force */
340 tx = _mm_mul_ps(fscal,dx00);
341 ty = _mm_mul_ps(fscal,dy00);
342 tz = _mm_mul_ps(fscal,dz00);
344 /* Update vectorial force */
345 fix0 = _mm_add_ps(fix0,tx);
346 fiy0 = _mm_add_ps(fiy0,ty);
347 fiz0 = _mm_add_ps(fiz0,tz);
349 fjx0 = _mm_add_ps(fjx0,tx);
350 fjy0 = _mm_add_ps(fjy0,ty);
351 fjz0 = _mm_add_ps(fjz0,tz);
353 /**************************
354 * CALCULATE INTERACTIONS *
355 **************************/
357 r01 = _mm_mul_ps(rsq01,rinv01);
359 /* Calculate table index by multiplying r with table scale and truncate to integer */
360 rt = _mm_mul_ps(r01,vftabscale);
361 vfitab = _mm_cvttps_epi32(rt);
362 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
363 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
365 /* CUBIC SPLINE TABLE ELECTROSTATICS */
366 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
367 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
368 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
369 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
370 _MM_TRANSPOSE4_PS(Y,F,G,H);
371 Heps = _mm_mul_ps(vfeps,H);
372 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
373 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
374 velec = _mm_mul_ps(qq01,VV);
375 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
376 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm_add_ps(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm_mul_ps(fscal,dx01);
385 ty = _mm_mul_ps(fscal,dy01);
386 tz = _mm_mul_ps(fscal,dz01);
388 /* Update vectorial force */
389 fix0 = _mm_add_ps(fix0,tx);
390 fiy0 = _mm_add_ps(fiy0,ty);
391 fiz0 = _mm_add_ps(fiz0,tz);
393 fjx1 = _mm_add_ps(fjx1,tx);
394 fjy1 = _mm_add_ps(fjy1,ty);
395 fjz1 = _mm_add_ps(fjz1,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 r02 = _mm_mul_ps(rsq02,rinv02);
403 /* Calculate table index by multiplying r with table scale and truncate to integer */
404 rt = _mm_mul_ps(r02,vftabscale);
405 vfitab = _mm_cvttps_epi32(rt);
406 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
407 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
409 /* CUBIC SPLINE TABLE ELECTROSTATICS */
410 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
411 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
412 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
413 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
414 _MM_TRANSPOSE4_PS(Y,F,G,H);
415 Heps = _mm_mul_ps(vfeps,H);
416 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
417 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
418 velec = _mm_mul_ps(qq02,VV);
419 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
420 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
422 /* Update potential sum for this i atom from the interaction with this j atom. */
423 velecsum = _mm_add_ps(velecsum,velec);
427 /* Calculate temporary vectorial force */
428 tx = _mm_mul_ps(fscal,dx02);
429 ty = _mm_mul_ps(fscal,dy02);
430 tz = _mm_mul_ps(fscal,dz02);
432 /* Update vectorial force */
433 fix0 = _mm_add_ps(fix0,tx);
434 fiy0 = _mm_add_ps(fiy0,ty);
435 fiz0 = _mm_add_ps(fiz0,tz);
437 fjx2 = _mm_add_ps(fjx2,tx);
438 fjy2 = _mm_add_ps(fjy2,ty);
439 fjz2 = _mm_add_ps(fjz2,tz);
441 /**************************
442 * CALCULATE INTERACTIONS *
443 **************************/
445 r10 = _mm_mul_ps(rsq10,rinv10);
447 /* Calculate table index by multiplying r with table scale and truncate to integer */
448 rt = _mm_mul_ps(r10,vftabscale);
449 vfitab = _mm_cvttps_epi32(rt);
450 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
451 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
453 /* CUBIC SPLINE TABLE ELECTROSTATICS */
454 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
455 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
456 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
457 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
458 _MM_TRANSPOSE4_PS(Y,F,G,H);
459 Heps = _mm_mul_ps(vfeps,H);
460 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
461 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
462 velec = _mm_mul_ps(qq10,VV);
463 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
464 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
466 /* Update potential sum for this i atom from the interaction with this j atom. */
467 velecsum = _mm_add_ps(velecsum,velec);
471 /* Calculate temporary vectorial force */
472 tx = _mm_mul_ps(fscal,dx10);
473 ty = _mm_mul_ps(fscal,dy10);
474 tz = _mm_mul_ps(fscal,dz10);
476 /* Update vectorial force */
477 fix1 = _mm_add_ps(fix1,tx);
478 fiy1 = _mm_add_ps(fiy1,ty);
479 fiz1 = _mm_add_ps(fiz1,tz);
481 fjx0 = _mm_add_ps(fjx0,tx);
482 fjy0 = _mm_add_ps(fjy0,ty);
483 fjz0 = _mm_add_ps(fjz0,tz);
485 /**************************
486 * CALCULATE INTERACTIONS *
487 **************************/
489 r11 = _mm_mul_ps(rsq11,rinv11);
491 /* Calculate table index by multiplying r with table scale and truncate to integer */
492 rt = _mm_mul_ps(r11,vftabscale);
493 vfitab = _mm_cvttps_epi32(rt);
494 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
495 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
497 /* CUBIC SPLINE TABLE ELECTROSTATICS */
498 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
499 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
500 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
501 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
502 _MM_TRANSPOSE4_PS(Y,F,G,H);
503 Heps = _mm_mul_ps(vfeps,H);
504 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
505 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
506 velec = _mm_mul_ps(qq11,VV);
507 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
508 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
510 /* Update potential sum for this i atom from the interaction with this j atom. */
511 velecsum = _mm_add_ps(velecsum,velec);
515 /* Calculate temporary vectorial force */
516 tx = _mm_mul_ps(fscal,dx11);
517 ty = _mm_mul_ps(fscal,dy11);
518 tz = _mm_mul_ps(fscal,dz11);
520 /* Update vectorial force */
521 fix1 = _mm_add_ps(fix1,tx);
522 fiy1 = _mm_add_ps(fiy1,ty);
523 fiz1 = _mm_add_ps(fiz1,tz);
525 fjx1 = _mm_add_ps(fjx1,tx);
526 fjy1 = _mm_add_ps(fjy1,ty);
527 fjz1 = _mm_add_ps(fjz1,tz);
529 /**************************
530 * CALCULATE INTERACTIONS *
531 **************************/
533 r12 = _mm_mul_ps(rsq12,rinv12);
535 /* Calculate table index by multiplying r with table scale and truncate to integer */
536 rt = _mm_mul_ps(r12,vftabscale);
537 vfitab = _mm_cvttps_epi32(rt);
538 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
539 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
541 /* CUBIC SPLINE TABLE ELECTROSTATICS */
542 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
543 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
544 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
545 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
546 _MM_TRANSPOSE4_PS(Y,F,G,H);
547 Heps = _mm_mul_ps(vfeps,H);
548 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
549 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
550 velec = _mm_mul_ps(qq12,VV);
551 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
552 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
554 /* Update potential sum for this i atom from the interaction with this j atom. */
555 velecsum = _mm_add_ps(velecsum,velec);
559 /* Calculate temporary vectorial force */
560 tx = _mm_mul_ps(fscal,dx12);
561 ty = _mm_mul_ps(fscal,dy12);
562 tz = _mm_mul_ps(fscal,dz12);
564 /* Update vectorial force */
565 fix1 = _mm_add_ps(fix1,tx);
566 fiy1 = _mm_add_ps(fiy1,ty);
567 fiz1 = _mm_add_ps(fiz1,tz);
569 fjx2 = _mm_add_ps(fjx2,tx);
570 fjy2 = _mm_add_ps(fjy2,ty);
571 fjz2 = _mm_add_ps(fjz2,tz);
573 /**************************
574 * CALCULATE INTERACTIONS *
575 **************************/
577 r20 = _mm_mul_ps(rsq20,rinv20);
579 /* Calculate table index by multiplying r with table scale and truncate to integer */
580 rt = _mm_mul_ps(r20,vftabscale);
581 vfitab = _mm_cvttps_epi32(rt);
582 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
583 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
585 /* CUBIC SPLINE TABLE ELECTROSTATICS */
586 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
587 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
588 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
589 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
590 _MM_TRANSPOSE4_PS(Y,F,G,H);
591 Heps = _mm_mul_ps(vfeps,H);
592 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
593 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
594 velec = _mm_mul_ps(qq20,VV);
595 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
596 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velecsum = _mm_add_ps(velecsum,velec);
603 /* Calculate temporary vectorial force */
604 tx = _mm_mul_ps(fscal,dx20);
605 ty = _mm_mul_ps(fscal,dy20);
606 tz = _mm_mul_ps(fscal,dz20);
608 /* Update vectorial force */
609 fix2 = _mm_add_ps(fix2,tx);
610 fiy2 = _mm_add_ps(fiy2,ty);
611 fiz2 = _mm_add_ps(fiz2,tz);
613 fjx0 = _mm_add_ps(fjx0,tx);
614 fjy0 = _mm_add_ps(fjy0,ty);
615 fjz0 = _mm_add_ps(fjz0,tz);
617 /**************************
618 * CALCULATE INTERACTIONS *
619 **************************/
621 r21 = _mm_mul_ps(rsq21,rinv21);
623 /* Calculate table index by multiplying r with table scale and truncate to integer */
624 rt = _mm_mul_ps(r21,vftabscale);
625 vfitab = _mm_cvttps_epi32(rt);
626 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
627 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
629 /* CUBIC SPLINE TABLE ELECTROSTATICS */
630 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
631 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
632 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
633 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
634 _MM_TRANSPOSE4_PS(Y,F,G,H);
635 Heps = _mm_mul_ps(vfeps,H);
636 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
637 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
638 velec = _mm_mul_ps(qq21,VV);
639 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
640 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
642 /* Update potential sum for this i atom from the interaction with this j atom. */
643 velecsum = _mm_add_ps(velecsum,velec);
647 /* Calculate temporary vectorial force */
648 tx = _mm_mul_ps(fscal,dx21);
649 ty = _mm_mul_ps(fscal,dy21);
650 tz = _mm_mul_ps(fscal,dz21);
652 /* Update vectorial force */
653 fix2 = _mm_add_ps(fix2,tx);
654 fiy2 = _mm_add_ps(fiy2,ty);
655 fiz2 = _mm_add_ps(fiz2,tz);
657 fjx1 = _mm_add_ps(fjx1,tx);
658 fjy1 = _mm_add_ps(fjy1,ty);
659 fjz1 = _mm_add_ps(fjz1,tz);
661 /**************************
662 * CALCULATE INTERACTIONS *
663 **************************/
665 r22 = _mm_mul_ps(rsq22,rinv22);
667 /* Calculate table index by multiplying r with table scale and truncate to integer */
668 rt = _mm_mul_ps(r22,vftabscale);
669 vfitab = _mm_cvttps_epi32(rt);
670 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
671 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
673 /* CUBIC SPLINE TABLE ELECTROSTATICS */
674 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
675 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
676 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
677 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
678 _MM_TRANSPOSE4_PS(Y,F,G,H);
679 Heps = _mm_mul_ps(vfeps,H);
680 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
681 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
682 velec = _mm_mul_ps(qq22,VV);
683 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
684 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
686 /* Update potential sum for this i atom from the interaction with this j atom. */
687 velecsum = _mm_add_ps(velecsum,velec);
691 /* Calculate temporary vectorial force */
692 tx = _mm_mul_ps(fscal,dx22);
693 ty = _mm_mul_ps(fscal,dy22);
694 tz = _mm_mul_ps(fscal,dz22);
696 /* Update vectorial force */
697 fix2 = _mm_add_ps(fix2,tx);
698 fiy2 = _mm_add_ps(fiy2,ty);
699 fiz2 = _mm_add_ps(fiz2,tz);
701 fjx2 = _mm_add_ps(fjx2,tx);
702 fjy2 = _mm_add_ps(fjy2,ty);
703 fjz2 = _mm_add_ps(fjz2,tz);
705 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
706 f+j_coord_offsetC,f+j_coord_offsetD,
707 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
709 /* Inner loop uses 417 flops */
715 /* Get j neighbor index, and coordinate index */
721 /* Sign of each element will be negative for non-real atoms.
722 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
723 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
725 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
726 jnrA = (jnrA>=0) ? jnrA : 0;
727 jnrB = (jnrB>=0) ? jnrB : 0;
728 jnrC = (jnrC>=0) ? jnrC : 0;
729 jnrD = (jnrD>=0) ? jnrD : 0;
731 j_coord_offsetA = DIM*jnrA;
732 j_coord_offsetB = DIM*jnrB;
733 j_coord_offsetC = DIM*jnrC;
734 j_coord_offsetD = DIM*jnrD;
736 /* load j atom coordinates */
737 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
738 x+j_coord_offsetC,x+j_coord_offsetD,
739 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
741 /* Calculate displacement vector */
742 dx00 = _mm_sub_ps(ix0,jx0);
743 dy00 = _mm_sub_ps(iy0,jy0);
744 dz00 = _mm_sub_ps(iz0,jz0);
745 dx01 = _mm_sub_ps(ix0,jx1);
746 dy01 = _mm_sub_ps(iy0,jy1);
747 dz01 = _mm_sub_ps(iz0,jz1);
748 dx02 = _mm_sub_ps(ix0,jx2);
749 dy02 = _mm_sub_ps(iy0,jy2);
750 dz02 = _mm_sub_ps(iz0,jz2);
751 dx10 = _mm_sub_ps(ix1,jx0);
752 dy10 = _mm_sub_ps(iy1,jy0);
753 dz10 = _mm_sub_ps(iz1,jz0);
754 dx11 = _mm_sub_ps(ix1,jx1);
755 dy11 = _mm_sub_ps(iy1,jy1);
756 dz11 = _mm_sub_ps(iz1,jz1);
757 dx12 = _mm_sub_ps(ix1,jx2);
758 dy12 = _mm_sub_ps(iy1,jy2);
759 dz12 = _mm_sub_ps(iz1,jz2);
760 dx20 = _mm_sub_ps(ix2,jx0);
761 dy20 = _mm_sub_ps(iy2,jy0);
762 dz20 = _mm_sub_ps(iz2,jz0);
763 dx21 = _mm_sub_ps(ix2,jx1);
764 dy21 = _mm_sub_ps(iy2,jy1);
765 dz21 = _mm_sub_ps(iz2,jz1);
766 dx22 = _mm_sub_ps(ix2,jx2);
767 dy22 = _mm_sub_ps(iy2,jy2);
768 dz22 = _mm_sub_ps(iz2,jz2);
770 /* Calculate squared distance and things based on it */
771 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
772 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
773 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
774 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
775 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
776 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
777 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
778 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
779 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
781 rinv00 = gmx_mm_invsqrt_ps(rsq00);
782 rinv01 = gmx_mm_invsqrt_ps(rsq01);
783 rinv02 = gmx_mm_invsqrt_ps(rsq02);
784 rinv10 = gmx_mm_invsqrt_ps(rsq10);
785 rinv11 = gmx_mm_invsqrt_ps(rsq11);
786 rinv12 = gmx_mm_invsqrt_ps(rsq12);
787 rinv20 = gmx_mm_invsqrt_ps(rsq20);
788 rinv21 = gmx_mm_invsqrt_ps(rsq21);
789 rinv22 = gmx_mm_invsqrt_ps(rsq22);
791 fjx0 = _mm_setzero_ps();
792 fjy0 = _mm_setzero_ps();
793 fjz0 = _mm_setzero_ps();
794 fjx1 = _mm_setzero_ps();
795 fjy1 = _mm_setzero_ps();
796 fjz1 = _mm_setzero_ps();
797 fjx2 = _mm_setzero_ps();
798 fjy2 = _mm_setzero_ps();
799 fjz2 = _mm_setzero_ps();
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 r00 = _mm_mul_ps(rsq00,rinv00);
806 r00 = _mm_andnot_ps(dummy_mask,r00);
808 /* Calculate table index by multiplying r with table scale and truncate to integer */
809 rt = _mm_mul_ps(r00,vftabscale);
810 vfitab = _mm_cvttps_epi32(rt);
811 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
812 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
814 /* CUBIC SPLINE TABLE ELECTROSTATICS */
815 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
816 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
817 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
818 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
819 _MM_TRANSPOSE4_PS(Y,F,G,H);
820 Heps = _mm_mul_ps(vfeps,H);
821 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
822 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
823 velec = _mm_mul_ps(qq00,VV);
824 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
825 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
827 /* CUBIC SPLINE TABLE DISPERSION */
828 vfitab = _mm_add_epi32(vfitab,ifour);
829 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
830 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
831 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
832 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
833 _MM_TRANSPOSE4_PS(Y,F,G,H);
834 Heps = _mm_mul_ps(vfeps,H);
835 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
836 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
837 vvdw6 = _mm_mul_ps(c6_00,VV);
838 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
839 fvdw6 = _mm_mul_ps(c6_00,FF);
841 /* CUBIC SPLINE TABLE REPULSION */
842 vfitab = _mm_add_epi32(vfitab,ifour);
843 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
844 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
845 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
846 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
847 _MM_TRANSPOSE4_PS(Y,F,G,H);
848 Heps = _mm_mul_ps(vfeps,H);
849 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
850 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
851 vvdw12 = _mm_mul_ps(c12_00,VV);
852 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
853 fvdw12 = _mm_mul_ps(c12_00,FF);
854 vvdw = _mm_add_ps(vvdw12,vvdw6);
855 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
857 /* Update potential sum for this i atom from the interaction with this j atom. */
858 velec = _mm_andnot_ps(dummy_mask,velec);
859 velecsum = _mm_add_ps(velecsum,velec);
860 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
861 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
863 fscal = _mm_add_ps(felec,fvdw);
865 fscal = _mm_andnot_ps(dummy_mask,fscal);
867 /* Calculate temporary vectorial force */
868 tx = _mm_mul_ps(fscal,dx00);
869 ty = _mm_mul_ps(fscal,dy00);
870 tz = _mm_mul_ps(fscal,dz00);
872 /* Update vectorial force */
873 fix0 = _mm_add_ps(fix0,tx);
874 fiy0 = _mm_add_ps(fiy0,ty);
875 fiz0 = _mm_add_ps(fiz0,tz);
877 fjx0 = _mm_add_ps(fjx0,tx);
878 fjy0 = _mm_add_ps(fjy0,ty);
879 fjz0 = _mm_add_ps(fjz0,tz);
881 /**************************
882 * CALCULATE INTERACTIONS *
883 **************************/
885 r01 = _mm_mul_ps(rsq01,rinv01);
886 r01 = _mm_andnot_ps(dummy_mask,r01);
888 /* Calculate table index by multiplying r with table scale and truncate to integer */
889 rt = _mm_mul_ps(r01,vftabscale);
890 vfitab = _mm_cvttps_epi32(rt);
891 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
892 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
894 /* CUBIC SPLINE TABLE ELECTROSTATICS */
895 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
896 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
897 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
898 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
899 _MM_TRANSPOSE4_PS(Y,F,G,H);
900 Heps = _mm_mul_ps(vfeps,H);
901 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
902 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
903 velec = _mm_mul_ps(qq01,VV);
904 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
905 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
907 /* Update potential sum for this i atom from the interaction with this j atom. */
908 velec = _mm_andnot_ps(dummy_mask,velec);
909 velecsum = _mm_add_ps(velecsum,velec);
913 fscal = _mm_andnot_ps(dummy_mask,fscal);
915 /* Calculate temporary vectorial force */
916 tx = _mm_mul_ps(fscal,dx01);
917 ty = _mm_mul_ps(fscal,dy01);
918 tz = _mm_mul_ps(fscal,dz01);
920 /* Update vectorial force */
921 fix0 = _mm_add_ps(fix0,tx);
922 fiy0 = _mm_add_ps(fiy0,ty);
923 fiz0 = _mm_add_ps(fiz0,tz);
925 fjx1 = _mm_add_ps(fjx1,tx);
926 fjy1 = _mm_add_ps(fjy1,ty);
927 fjz1 = _mm_add_ps(fjz1,tz);
929 /**************************
930 * CALCULATE INTERACTIONS *
931 **************************/
933 r02 = _mm_mul_ps(rsq02,rinv02);
934 r02 = _mm_andnot_ps(dummy_mask,r02);
936 /* Calculate table index by multiplying r with table scale and truncate to integer */
937 rt = _mm_mul_ps(r02,vftabscale);
938 vfitab = _mm_cvttps_epi32(rt);
939 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
940 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
942 /* CUBIC SPLINE TABLE ELECTROSTATICS */
943 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
944 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
945 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
946 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
947 _MM_TRANSPOSE4_PS(Y,F,G,H);
948 Heps = _mm_mul_ps(vfeps,H);
949 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
950 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
951 velec = _mm_mul_ps(qq02,VV);
952 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
953 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
955 /* Update potential sum for this i atom from the interaction with this j atom. */
956 velec = _mm_andnot_ps(dummy_mask,velec);
957 velecsum = _mm_add_ps(velecsum,velec);
961 fscal = _mm_andnot_ps(dummy_mask,fscal);
963 /* Calculate temporary vectorial force */
964 tx = _mm_mul_ps(fscal,dx02);
965 ty = _mm_mul_ps(fscal,dy02);
966 tz = _mm_mul_ps(fscal,dz02);
968 /* Update vectorial force */
969 fix0 = _mm_add_ps(fix0,tx);
970 fiy0 = _mm_add_ps(fiy0,ty);
971 fiz0 = _mm_add_ps(fiz0,tz);
973 fjx2 = _mm_add_ps(fjx2,tx);
974 fjy2 = _mm_add_ps(fjy2,ty);
975 fjz2 = _mm_add_ps(fjz2,tz);
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 r10 = _mm_mul_ps(rsq10,rinv10);
982 r10 = _mm_andnot_ps(dummy_mask,r10);
984 /* Calculate table index by multiplying r with table scale and truncate to integer */
985 rt = _mm_mul_ps(r10,vftabscale);
986 vfitab = _mm_cvttps_epi32(rt);
987 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
988 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
990 /* CUBIC SPLINE TABLE ELECTROSTATICS */
991 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
992 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
993 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
994 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
995 _MM_TRANSPOSE4_PS(Y,F,G,H);
996 Heps = _mm_mul_ps(vfeps,H);
997 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
998 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
999 velec = _mm_mul_ps(qq10,VV);
1000 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1001 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1003 /* Update potential sum for this i atom from the interaction with this j atom. */
1004 velec = _mm_andnot_ps(dummy_mask,velec);
1005 velecsum = _mm_add_ps(velecsum,velec);
1009 fscal = _mm_andnot_ps(dummy_mask,fscal);
1011 /* Calculate temporary vectorial force */
1012 tx = _mm_mul_ps(fscal,dx10);
1013 ty = _mm_mul_ps(fscal,dy10);
1014 tz = _mm_mul_ps(fscal,dz10);
1016 /* Update vectorial force */
1017 fix1 = _mm_add_ps(fix1,tx);
1018 fiy1 = _mm_add_ps(fiy1,ty);
1019 fiz1 = _mm_add_ps(fiz1,tz);
1021 fjx0 = _mm_add_ps(fjx0,tx);
1022 fjy0 = _mm_add_ps(fjy0,ty);
1023 fjz0 = _mm_add_ps(fjz0,tz);
1025 /**************************
1026 * CALCULATE INTERACTIONS *
1027 **************************/
1029 r11 = _mm_mul_ps(rsq11,rinv11);
1030 r11 = _mm_andnot_ps(dummy_mask,r11);
1032 /* Calculate table index by multiplying r with table scale and truncate to integer */
1033 rt = _mm_mul_ps(r11,vftabscale);
1034 vfitab = _mm_cvttps_epi32(rt);
1035 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1036 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1038 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1039 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1040 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1041 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1042 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1043 _MM_TRANSPOSE4_PS(Y,F,G,H);
1044 Heps = _mm_mul_ps(vfeps,H);
1045 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1046 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1047 velec = _mm_mul_ps(qq11,VV);
1048 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1049 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm_andnot_ps(dummy_mask,velec);
1053 velecsum = _mm_add_ps(velecsum,velec);
1057 fscal = _mm_andnot_ps(dummy_mask,fscal);
1059 /* Calculate temporary vectorial force */
1060 tx = _mm_mul_ps(fscal,dx11);
1061 ty = _mm_mul_ps(fscal,dy11);
1062 tz = _mm_mul_ps(fscal,dz11);
1064 /* Update vectorial force */
1065 fix1 = _mm_add_ps(fix1,tx);
1066 fiy1 = _mm_add_ps(fiy1,ty);
1067 fiz1 = _mm_add_ps(fiz1,tz);
1069 fjx1 = _mm_add_ps(fjx1,tx);
1070 fjy1 = _mm_add_ps(fjy1,ty);
1071 fjz1 = _mm_add_ps(fjz1,tz);
1073 /**************************
1074 * CALCULATE INTERACTIONS *
1075 **************************/
1077 r12 = _mm_mul_ps(rsq12,rinv12);
1078 r12 = _mm_andnot_ps(dummy_mask,r12);
1080 /* Calculate table index by multiplying r with table scale and truncate to integer */
1081 rt = _mm_mul_ps(r12,vftabscale);
1082 vfitab = _mm_cvttps_epi32(rt);
1083 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1084 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1086 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1087 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1088 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1089 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1090 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1091 _MM_TRANSPOSE4_PS(Y,F,G,H);
1092 Heps = _mm_mul_ps(vfeps,H);
1093 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1094 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1095 velec = _mm_mul_ps(qq12,VV);
1096 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1097 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _mm_andnot_ps(dummy_mask,velec);
1101 velecsum = _mm_add_ps(velecsum,velec);
1105 fscal = _mm_andnot_ps(dummy_mask,fscal);
1107 /* Calculate temporary vectorial force */
1108 tx = _mm_mul_ps(fscal,dx12);
1109 ty = _mm_mul_ps(fscal,dy12);
1110 tz = _mm_mul_ps(fscal,dz12);
1112 /* Update vectorial force */
1113 fix1 = _mm_add_ps(fix1,tx);
1114 fiy1 = _mm_add_ps(fiy1,ty);
1115 fiz1 = _mm_add_ps(fiz1,tz);
1117 fjx2 = _mm_add_ps(fjx2,tx);
1118 fjy2 = _mm_add_ps(fjy2,ty);
1119 fjz2 = _mm_add_ps(fjz2,tz);
1121 /**************************
1122 * CALCULATE INTERACTIONS *
1123 **************************/
1125 r20 = _mm_mul_ps(rsq20,rinv20);
1126 r20 = _mm_andnot_ps(dummy_mask,r20);
1128 /* Calculate table index by multiplying r with table scale and truncate to integer */
1129 rt = _mm_mul_ps(r20,vftabscale);
1130 vfitab = _mm_cvttps_epi32(rt);
1131 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1132 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1134 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1135 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1136 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1137 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1138 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1139 _MM_TRANSPOSE4_PS(Y,F,G,H);
1140 Heps = _mm_mul_ps(vfeps,H);
1141 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1142 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1143 velec = _mm_mul_ps(qq20,VV);
1144 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1145 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1147 /* Update potential sum for this i atom from the interaction with this j atom. */
1148 velec = _mm_andnot_ps(dummy_mask,velec);
1149 velecsum = _mm_add_ps(velecsum,velec);
1153 fscal = _mm_andnot_ps(dummy_mask,fscal);
1155 /* Calculate temporary vectorial force */
1156 tx = _mm_mul_ps(fscal,dx20);
1157 ty = _mm_mul_ps(fscal,dy20);
1158 tz = _mm_mul_ps(fscal,dz20);
1160 /* Update vectorial force */
1161 fix2 = _mm_add_ps(fix2,tx);
1162 fiy2 = _mm_add_ps(fiy2,ty);
1163 fiz2 = _mm_add_ps(fiz2,tz);
1165 fjx0 = _mm_add_ps(fjx0,tx);
1166 fjy0 = _mm_add_ps(fjy0,ty);
1167 fjz0 = _mm_add_ps(fjz0,tz);
1169 /**************************
1170 * CALCULATE INTERACTIONS *
1171 **************************/
1173 r21 = _mm_mul_ps(rsq21,rinv21);
1174 r21 = _mm_andnot_ps(dummy_mask,r21);
1176 /* Calculate table index by multiplying r with table scale and truncate to integer */
1177 rt = _mm_mul_ps(r21,vftabscale);
1178 vfitab = _mm_cvttps_epi32(rt);
1179 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1180 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1182 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1183 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1184 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1185 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1186 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1187 _MM_TRANSPOSE4_PS(Y,F,G,H);
1188 Heps = _mm_mul_ps(vfeps,H);
1189 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1190 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1191 velec = _mm_mul_ps(qq21,VV);
1192 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1193 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1195 /* Update potential sum for this i atom from the interaction with this j atom. */
1196 velec = _mm_andnot_ps(dummy_mask,velec);
1197 velecsum = _mm_add_ps(velecsum,velec);
1201 fscal = _mm_andnot_ps(dummy_mask,fscal);
1203 /* Calculate temporary vectorial force */
1204 tx = _mm_mul_ps(fscal,dx21);
1205 ty = _mm_mul_ps(fscal,dy21);
1206 tz = _mm_mul_ps(fscal,dz21);
1208 /* Update vectorial force */
1209 fix2 = _mm_add_ps(fix2,tx);
1210 fiy2 = _mm_add_ps(fiy2,ty);
1211 fiz2 = _mm_add_ps(fiz2,tz);
1213 fjx1 = _mm_add_ps(fjx1,tx);
1214 fjy1 = _mm_add_ps(fjy1,ty);
1215 fjz1 = _mm_add_ps(fjz1,tz);
1217 /**************************
1218 * CALCULATE INTERACTIONS *
1219 **************************/
1221 r22 = _mm_mul_ps(rsq22,rinv22);
1222 r22 = _mm_andnot_ps(dummy_mask,r22);
1224 /* Calculate table index by multiplying r with table scale and truncate to integer */
1225 rt = _mm_mul_ps(r22,vftabscale);
1226 vfitab = _mm_cvttps_epi32(rt);
1227 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1228 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1230 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1231 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1232 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1233 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1234 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1235 _MM_TRANSPOSE4_PS(Y,F,G,H);
1236 Heps = _mm_mul_ps(vfeps,H);
1237 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1238 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1239 velec = _mm_mul_ps(qq22,VV);
1240 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1241 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1243 /* Update potential sum for this i atom from the interaction with this j atom. */
1244 velec = _mm_andnot_ps(dummy_mask,velec);
1245 velecsum = _mm_add_ps(velecsum,velec);
1249 fscal = _mm_andnot_ps(dummy_mask,fscal);
1251 /* Calculate temporary vectorial force */
1252 tx = _mm_mul_ps(fscal,dx22);
1253 ty = _mm_mul_ps(fscal,dy22);
1254 tz = _mm_mul_ps(fscal,dz22);
1256 /* Update vectorial force */
1257 fix2 = _mm_add_ps(fix2,tx);
1258 fiy2 = _mm_add_ps(fiy2,ty);
1259 fiz2 = _mm_add_ps(fiz2,tz);
1261 fjx2 = _mm_add_ps(fjx2,tx);
1262 fjy2 = _mm_add_ps(fjy2,ty);
1263 fjz2 = _mm_add_ps(fjz2,tz);
1265 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1266 f+j_coord_offsetC,f+j_coord_offsetD,
1267 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1269 /* Inner loop uses 426 flops */
1272 /* End of innermost loop */
1274 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1275 f+i_coord_offset,fshift+i_shift_offset);
1278 /* Update potential energies */
1279 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1280 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1282 /* Increment number of inner iterations */
1283 inneriter += j_index_end - j_index_start;
1285 /* Outer loop uses 29 flops */
1288 /* Increment number of outer iterations */
1291 /* Update outer/inner flops */
1293 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*29 + inneriter*426);
1296 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1297 * Electrostatics interaction: CubicSplineTable
1298 * VdW interaction: CubicSplineTable
1299 * Geometry: Water3-Water3
1300 * Calculate force/pot: Force
1303 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1304 (t_nblist * gmx_restrict nlist,
1305 rvec * gmx_restrict xx,
1306 rvec * gmx_restrict ff,
1307 t_forcerec * gmx_restrict fr,
1308 t_mdatoms * gmx_restrict mdatoms,
1309 nb_kernel_data_t * gmx_restrict kernel_data,
1310 t_nrnb * gmx_restrict nrnb)
1312 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1313 * just 0 for non-waters.
1314 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1315 * jnr indices corresponding to data put in the four positions in the SIMD register.
1317 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1318 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1319 int jnrA,jnrB,jnrC,jnrD;
1320 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1321 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1322 real shX,shY,shZ,rcutoff_scalar;
1323 real *shiftvec,*fshift,*x,*f;
1324 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1326 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1328 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1330 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1331 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1332 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1333 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1334 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1335 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1336 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1337 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1338 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1339 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1340 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1341 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1342 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1343 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1344 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1345 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1346 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1349 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1352 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1353 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1355 __m128i ifour = _mm_set1_epi32(4);
1356 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1358 __m128 dummy_mask,cutoff_mask;
1359 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1360 __m128 one = _mm_set1_ps(1.0);
1361 __m128 two = _mm_set1_ps(2.0);
1367 jindex = nlist->jindex;
1369 shiftidx = nlist->shift;
1371 shiftvec = fr->shift_vec[0];
1372 fshift = fr->fshift[0];
1373 facel = _mm_set1_ps(fr->epsfac);
1374 charge = mdatoms->chargeA;
1375 nvdwtype = fr->ntype;
1376 vdwparam = fr->nbfp;
1377 vdwtype = mdatoms->typeA;
1379 vftab = kernel_data->table_elec_vdw->data;
1380 vftabscale = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1382 /* Setup water-specific parameters */
1383 inr = nlist->iinr[0];
1384 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1385 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1386 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1387 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1389 jq0 = _mm_set1_ps(charge[inr+0]);
1390 jq1 = _mm_set1_ps(charge[inr+1]);
1391 jq2 = _mm_set1_ps(charge[inr+2]);
1392 vdwjidx0A = 2*vdwtype[inr+0];
1393 qq00 = _mm_mul_ps(iq0,jq0);
1394 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1395 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1396 qq01 = _mm_mul_ps(iq0,jq1);
1397 qq02 = _mm_mul_ps(iq0,jq2);
1398 qq10 = _mm_mul_ps(iq1,jq0);
1399 qq11 = _mm_mul_ps(iq1,jq1);
1400 qq12 = _mm_mul_ps(iq1,jq2);
1401 qq20 = _mm_mul_ps(iq2,jq0);
1402 qq21 = _mm_mul_ps(iq2,jq1);
1403 qq22 = _mm_mul_ps(iq2,jq2);
1405 /* Avoid stupid compiler warnings */
1406 jnrA = jnrB = jnrC = jnrD = 0;
1407 j_coord_offsetA = 0;
1408 j_coord_offsetB = 0;
1409 j_coord_offsetC = 0;
1410 j_coord_offsetD = 0;
1415 /* Start outer loop over neighborlists */
1416 for(iidx=0; iidx<nri; iidx++)
1418 /* Load shift vector for this list */
1419 i_shift_offset = DIM*shiftidx[iidx];
1420 shX = shiftvec[i_shift_offset+XX];
1421 shY = shiftvec[i_shift_offset+YY];
1422 shZ = shiftvec[i_shift_offset+ZZ];
1424 /* Load limits for loop over neighbors */
1425 j_index_start = jindex[iidx];
1426 j_index_end = jindex[iidx+1];
1428 /* Get outer coordinate index */
1430 i_coord_offset = DIM*inr;
1432 /* Load i particle coords and add shift vector */
1433 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1434 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1435 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1436 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1437 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1438 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1439 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1440 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1441 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1443 fix0 = _mm_setzero_ps();
1444 fiy0 = _mm_setzero_ps();
1445 fiz0 = _mm_setzero_ps();
1446 fix1 = _mm_setzero_ps();
1447 fiy1 = _mm_setzero_ps();
1448 fiz1 = _mm_setzero_ps();
1449 fix2 = _mm_setzero_ps();
1450 fiy2 = _mm_setzero_ps();
1451 fiz2 = _mm_setzero_ps();
1453 /* Start inner kernel loop */
1454 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1457 /* Get j neighbor index, and coordinate index */
1459 jnrB = jjnr[jidx+1];
1460 jnrC = jjnr[jidx+2];
1461 jnrD = jjnr[jidx+3];
1463 j_coord_offsetA = DIM*jnrA;
1464 j_coord_offsetB = DIM*jnrB;
1465 j_coord_offsetC = DIM*jnrC;
1466 j_coord_offsetD = DIM*jnrD;
1468 /* load j atom coordinates */
1469 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1470 x+j_coord_offsetC,x+j_coord_offsetD,
1471 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1473 /* Calculate displacement vector */
1474 dx00 = _mm_sub_ps(ix0,jx0);
1475 dy00 = _mm_sub_ps(iy0,jy0);
1476 dz00 = _mm_sub_ps(iz0,jz0);
1477 dx01 = _mm_sub_ps(ix0,jx1);
1478 dy01 = _mm_sub_ps(iy0,jy1);
1479 dz01 = _mm_sub_ps(iz0,jz1);
1480 dx02 = _mm_sub_ps(ix0,jx2);
1481 dy02 = _mm_sub_ps(iy0,jy2);
1482 dz02 = _mm_sub_ps(iz0,jz2);
1483 dx10 = _mm_sub_ps(ix1,jx0);
1484 dy10 = _mm_sub_ps(iy1,jy0);
1485 dz10 = _mm_sub_ps(iz1,jz0);
1486 dx11 = _mm_sub_ps(ix1,jx1);
1487 dy11 = _mm_sub_ps(iy1,jy1);
1488 dz11 = _mm_sub_ps(iz1,jz1);
1489 dx12 = _mm_sub_ps(ix1,jx2);
1490 dy12 = _mm_sub_ps(iy1,jy2);
1491 dz12 = _mm_sub_ps(iz1,jz2);
1492 dx20 = _mm_sub_ps(ix2,jx0);
1493 dy20 = _mm_sub_ps(iy2,jy0);
1494 dz20 = _mm_sub_ps(iz2,jz0);
1495 dx21 = _mm_sub_ps(ix2,jx1);
1496 dy21 = _mm_sub_ps(iy2,jy1);
1497 dz21 = _mm_sub_ps(iz2,jz1);
1498 dx22 = _mm_sub_ps(ix2,jx2);
1499 dy22 = _mm_sub_ps(iy2,jy2);
1500 dz22 = _mm_sub_ps(iz2,jz2);
1502 /* Calculate squared distance and things based on it */
1503 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1504 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1505 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1506 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1507 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1508 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1509 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1510 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1511 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1513 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1514 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1515 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1516 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1517 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1518 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1519 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1520 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1521 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1523 fjx0 = _mm_setzero_ps();
1524 fjy0 = _mm_setzero_ps();
1525 fjz0 = _mm_setzero_ps();
1526 fjx1 = _mm_setzero_ps();
1527 fjy1 = _mm_setzero_ps();
1528 fjz1 = _mm_setzero_ps();
1529 fjx2 = _mm_setzero_ps();
1530 fjy2 = _mm_setzero_ps();
1531 fjz2 = _mm_setzero_ps();
1533 /**************************
1534 * CALCULATE INTERACTIONS *
1535 **************************/
1537 r00 = _mm_mul_ps(rsq00,rinv00);
1539 /* Calculate table index by multiplying r with table scale and truncate to integer */
1540 rt = _mm_mul_ps(r00,vftabscale);
1541 vfitab = _mm_cvttps_epi32(rt);
1542 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1543 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1545 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1546 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1547 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1548 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1549 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1550 _MM_TRANSPOSE4_PS(Y,F,G,H);
1551 Heps = _mm_mul_ps(vfeps,H);
1552 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1553 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1554 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1556 /* CUBIC SPLINE TABLE DISPERSION */
1557 vfitab = _mm_add_epi32(vfitab,ifour);
1558 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1559 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1560 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1561 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1562 _MM_TRANSPOSE4_PS(Y,F,G,H);
1563 Heps = _mm_mul_ps(vfeps,H);
1564 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1565 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1566 fvdw6 = _mm_mul_ps(c6_00,FF);
1568 /* CUBIC SPLINE TABLE REPULSION */
1569 vfitab = _mm_add_epi32(vfitab,ifour);
1570 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1571 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1572 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1573 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1574 _MM_TRANSPOSE4_PS(Y,F,G,H);
1575 Heps = _mm_mul_ps(vfeps,H);
1576 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1577 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1578 fvdw12 = _mm_mul_ps(c12_00,FF);
1579 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1581 fscal = _mm_add_ps(felec,fvdw);
1583 /* Calculate temporary vectorial force */
1584 tx = _mm_mul_ps(fscal,dx00);
1585 ty = _mm_mul_ps(fscal,dy00);
1586 tz = _mm_mul_ps(fscal,dz00);
1588 /* Update vectorial force */
1589 fix0 = _mm_add_ps(fix0,tx);
1590 fiy0 = _mm_add_ps(fiy0,ty);
1591 fiz0 = _mm_add_ps(fiz0,tz);
1593 fjx0 = _mm_add_ps(fjx0,tx);
1594 fjy0 = _mm_add_ps(fjy0,ty);
1595 fjz0 = _mm_add_ps(fjz0,tz);
1597 /**************************
1598 * CALCULATE INTERACTIONS *
1599 **************************/
1601 r01 = _mm_mul_ps(rsq01,rinv01);
1603 /* Calculate table index by multiplying r with table scale and truncate to integer */
1604 rt = _mm_mul_ps(r01,vftabscale);
1605 vfitab = _mm_cvttps_epi32(rt);
1606 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1607 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1609 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1610 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1611 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1612 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1613 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1614 _MM_TRANSPOSE4_PS(Y,F,G,H);
1615 Heps = _mm_mul_ps(vfeps,H);
1616 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1617 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1618 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1622 /* Calculate temporary vectorial force */
1623 tx = _mm_mul_ps(fscal,dx01);
1624 ty = _mm_mul_ps(fscal,dy01);
1625 tz = _mm_mul_ps(fscal,dz01);
1627 /* Update vectorial force */
1628 fix0 = _mm_add_ps(fix0,tx);
1629 fiy0 = _mm_add_ps(fiy0,ty);
1630 fiz0 = _mm_add_ps(fiz0,tz);
1632 fjx1 = _mm_add_ps(fjx1,tx);
1633 fjy1 = _mm_add_ps(fjy1,ty);
1634 fjz1 = _mm_add_ps(fjz1,tz);
1636 /**************************
1637 * CALCULATE INTERACTIONS *
1638 **************************/
1640 r02 = _mm_mul_ps(rsq02,rinv02);
1642 /* Calculate table index by multiplying r with table scale and truncate to integer */
1643 rt = _mm_mul_ps(r02,vftabscale);
1644 vfitab = _mm_cvttps_epi32(rt);
1645 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1646 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1648 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1649 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1650 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1651 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1652 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1653 _MM_TRANSPOSE4_PS(Y,F,G,H);
1654 Heps = _mm_mul_ps(vfeps,H);
1655 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1656 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1657 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1661 /* Calculate temporary vectorial force */
1662 tx = _mm_mul_ps(fscal,dx02);
1663 ty = _mm_mul_ps(fscal,dy02);
1664 tz = _mm_mul_ps(fscal,dz02);
1666 /* Update vectorial force */
1667 fix0 = _mm_add_ps(fix0,tx);
1668 fiy0 = _mm_add_ps(fiy0,ty);
1669 fiz0 = _mm_add_ps(fiz0,tz);
1671 fjx2 = _mm_add_ps(fjx2,tx);
1672 fjy2 = _mm_add_ps(fjy2,ty);
1673 fjz2 = _mm_add_ps(fjz2,tz);
1675 /**************************
1676 * CALCULATE INTERACTIONS *
1677 **************************/
1679 r10 = _mm_mul_ps(rsq10,rinv10);
1681 /* Calculate table index by multiplying r with table scale and truncate to integer */
1682 rt = _mm_mul_ps(r10,vftabscale);
1683 vfitab = _mm_cvttps_epi32(rt);
1684 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1685 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1687 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1688 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1689 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1690 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1691 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1692 _MM_TRANSPOSE4_PS(Y,F,G,H);
1693 Heps = _mm_mul_ps(vfeps,H);
1694 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1695 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1696 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1700 /* Calculate temporary vectorial force */
1701 tx = _mm_mul_ps(fscal,dx10);
1702 ty = _mm_mul_ps(fscal,dy10);
1703 tz = _mm_mul_ps(fscal,dz10);
1705 /* Update vectorial force */
1706 fix1 = _mm_add_ps(fix1,tx);
1707 fiy1 = _mm_add_ps(fiy1,ty);
1708 fiz1 = _mm_add_ps(fiz1,tz);
1710 fjx0 = _mm_add_ps(fjx0,tx);
1711 fjy0 = _mm_add_ps(fjy0,ty);
1712 fjz0 = _mm_add_ps(fjz0,tz);
1714 /**************************
1715 * CALCULATE INTERACTIONS *
1716 **************************/
1718 r11 = _mm_mul_ps(rsq11,rinv11);
1720 /* Calculate table index by multiplying r with table scale and truncate to integer */
1721 rt = _mm_mul_ps(r11,vftabscale);
1722 vfitab = _mm_cvttps_epi32(rt);
1723 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1724 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1726 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1727 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1728 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1729 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1730 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1731 _MM_TRANSPOSE4_PS(Y,F,G,H);
1732 Heps = _mm_mul_ps(vfeps,H);
1733 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1734 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1735 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1739 /* Calculate temporary vectorial force */
1740 tx = _mm_mul_ps(fscal,dx11);
1741 ty = _mm_mul_ps(fscal,dy11);
1742 tz = _mm_mul_ps(fscal,dz11);
1744 /* Update vectorial force */
1745 fix1 = _mm_add_ps(fix1,tx);
1746 fiy1 = _mm_add_ps(fiy1,ty);
1747 fiz1 = _mm_add_ps(fiz1,tz);
1749 fjx1 = _mm_add_ps(fjx1,tx);
1750 fjy1 = _mm_add_ps(fjy1,ty);
1751 fjz1 = _mm_add_ps(fjz1,tz);
1753 /**************************
1754 * CALCULATE INTERACTIONS *
1755 **************************/
1757 r12 = _mm_mul_ps(rsq12,rinv12);
1759 /* Calculate table index by multiplying r with table scale and truncate to integer */
1760 rt = _mm_mul_ps(r12,vftabscale);
1761 vfitab = _mm_cvttps_epi32(rt);
1762 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1763 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1765 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1766 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1767 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1768 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1769 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1770 _MM_TRANSPOSE4_PS(Y,F,G,H);
1771 Heps = _mm_mul_ps(vfeps,H);
1772 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1773 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1774 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1778 /* Calculate temporary vectorial force */
1779 tx = _mm_mul_ps(fscal,dx12);
1780 ty = _mm_mul_ps(fscal,dy12);
1781 tz = _mm_mul_ps(fscal,dz12);
1783 /* Update vectorial force */
1784 fix1 = _mm_add_ps(fix1,tx);
1785 fiy1 = _mm_add_ps(fiy1,ty);
1786 fiz1 = _mm_add_ps(fiz1,tz);
1788 fjx2 = _mm_add_ps(fjx2,tx);
1789 fjy2 = _mm_add_ps(fjy2,ty);
1790 fjz2 = _mm_add_ps(fjz2,tz);
1792 /**************************
1793 * CALCULATE INTERACTIONS *
1794 **************************/
1796 r20 = _mm_mul_ps(rsq20,rinv20);
1798 /* Calculate table index by multiplying r with table scale and truncate to integer */
1799 rt = _mm_mul_ps(r20,vftabscale);
1800 vfitab = _mm_cvttps_epi32(rt);
1801 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1802 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1804 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1805 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1806 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1807 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1808 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1809 _MM_TRANSPOSE4_PS(Y,F,G,H);
1810 Heps = _mm_mul_ps(vfeps,H);
1811 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1812 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1813 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1817 /* Calculate temporary vectorial force */
1818 tx = _mm_mul_ps(fscal,dx20);
1819 ty = _mm_mul_ps(fscal,dy20);
1820 tz = _mm_mul_ps(fscal,dz20);
1822 /* Update vectorial force */
1823 fix2 = _mm_add_ps(fix2,tx);
1824 fiy2 = _mm_add_ps(fiy2,ty);
1825 fiz2 = _mm_add_ps(fiz2,tz);
1827 fjx0 = _mm_add_ps(fjx0,tx);
1828 fjy0 = _mm_add_ps(fjy0,ty);
1829 fjz0 = _mm_add_ps(fjz0,tz);
1831 /**************************
1832 * CALCULATE INTERACTIONS *
1833 **************************/
1835 r21 = _mm_mul_ps(rsq21,rinv21);
1837 /* Calculate table index by multiplying r with table scale and truncate to integer */
1838 rt = _mm_mul_ps(r21,vftabscale);
1839 vfitab = _mm_cvttps_epi32(rt);
1840 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1841 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1843 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1844 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1845 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1846 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1847 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1848 _MM_TRANSPOSE4_PS(Y,F,G,H);
1849 Heps = _mm_mul_ps(vfeps,H);
1850 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1851 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1852 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1856 /* Calculate temporary vectorial force */
1857 tx = _mm_mul_ps(fscal,dx21);
1858 ty = _mm_mul_ps(fscal,dy21);
1859 tz = _mm_mul_ps(fscal,dz21);
1861 /* Update vectorial force */
1862 fix2 = _mm_add_ps(fix2,tx);
1863 fiy2 = _mm_add_ps(fiy2,ty);
1864 fiz2 = _mm_add_ps(fiz2,tz);
1866 fjx1 = _mm_add_ps(fjx1,tx);
1867 fjy1 = _mm_add_ps(fjy1,ty);
1868 fjz1 = _mm_add_ps(fjz1,tz);
1870 /**************************
1871 * CALCULATE INTERACTIONS *
1872 **************************/
1874 r22 = _mm_mul_ps(rsq22,rinv22);
1876 /* Calculate table index by multiplying r with table scale and truncate to integer */
1877 rt = _mm_mul_ps(r22,vftabscale);
1878 vfitab = _mm_cvttps_epi32(rt);
1879 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1880 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1882 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1883 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1884 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1885 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1886 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1887 _MM_TRANSPOSE4_PS(Y,F,G,H);
1888 Heps = _mm_mul_ps(vfeps,H);
1889 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1890 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1891 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1895 /* Calculate temporary vectorial force */
1896 tx = _mm_mul_ps(fscal,dx22);
1897 ty = _mm_mul_ps(fscal,dy22);
1898 tz = _mm_mul_ps(fscal,dz22);
1900 /* Update vectorial force */
1901 fix2 = _mm_add_ps(fix2,tx);
1902 fiy2 = _mm_add_ps(fiy2,ty);
1903 fiz2 = _mm_add_ps(fiz2,tz);
1905 fjx2 = _mm_add_ps(fjx2,tx);
1906 fjy2 = _mm_add_ps(fjy2,ty);
1907 fjz2 = _mm_add_ps(fjz2,tz);
1909 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1910 f+j_coord_offsetC,f+j_coord_offsetD,
1911 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1913 /* Inner loop uses 373 flops */
1916 if(jidx<j_index_end)
1919 /* Get j neighbor index, and coordinate index */
1921 jnrB = jjnr[jidx+1];
1922 jnrC = jjnr[jidx+2];
1923 jnrD = jjnr[jidx+3];
1925 /* Sign of each element will be negative for non-real atoms.
1926 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1927 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1929 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1930 jnrA = (jnrA>=0) ? jnrA : 0;
1931 jnrB = (jnrB>=0) ? jnrB : 0;
1932 jnrC = (jnrC>=0) ? jnrC : 0;
1933 jnrD = (jnrD>=0) ? jnrD : 0;
1935 j_coord_offsetA = DIM*jnrA;
1936 j_coord_offsetB = DIM*jnrB;
1937 j_coord_offsetC = DIM*jnrC;
1938 j_coord_offsetD = DIM*jnrD;
1940 /* load j atom coordinates */
1941 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1942 x+j_coord_offsetC,x+j_coord_offsetD,
1943 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1945 /* Calculate displacement vector */
1946 dx00 = _mm_sub_ps(ix0,jx0);
1947 dy00 = _mm_sub_ps(iy0,jy0);
1948 dz00 = _mm_sub_ps(iz0,jz0);
1949 dx01 = _mm_sub_ps(ix0,jx1);
1950 dy01 = _mm_sub_ps(iy0,jy1);
1951 dz01 = _mm_sub_ps(iz0,jz1);
1952 dx02 = _mm_sub_ps(ix0,jx2);
1953 dy02 = _mm_sub_ps(iy0,jy2);
1954 dz02 = _mm_sub_ps(iz0,jz2);
1955 dx10 = _mm_sub_ps(ix1,jx0);
1956 dy10 = _mm_sub_ps(iy1,jy0);
1957 dz10 = _mm_sub_ps(iz1,jz0);
1958 dx11 = _mm_sub_ps(ix1,jx1);
1959 dy11 = _mm_sub_ps(iy1,jy1);
1960 dz11 = _mm_sub_ps(iz1,jz1);
1961 dx12 = _mm_sub_ps(ix1,jx2);
1962 dy12 = _mm_sub_ps(iy1,jy2);
1963 dz12 = _mm_sub_ps(iz1,jz2);
1964 dx20 = _mm_sub_ps(ix2,jx0);
1965 dy20 = _mm_sub_ps(iy2,jy0);
1966 dz20 = _mm_sub_ps(iz2,jz0);
1967 dx21 = _mm_sub_ps(ix2,jx1);
1968 dy21 = _mm_sub_ps(iy2,jy1);
1969 dz21 = _mm_sub_ps(iz2,jz1);
1970 dx22 = _mm_sub_ps(ix2,jx2);
1971 dy22 = _mm_sub_ps(iy2,jy2);
1972 dz22 = _mm_sub_ps(iz2,jz2);
1974 /* Calculate squared distance and things based on it */
1975 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1976 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1977 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1978 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1979 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1980 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1981 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1982 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1983 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1985 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1986 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1987 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1988 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1989 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1990 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1991 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1992 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1993 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1995 fjx0 = _mm_setzero_ps();
1996 fjy0 = _mm_setzero_ps();
1997 fjz0 = _mm_setzero_ps();
1998 fjx1 = _mm_setzero_ps();
1999 fjy1 = _mm_setzero_ps();
2000 fjz1 = _mm_setzero_ps();
2001 fjx2 = _mm_setzero_ps();
2002 fjy2 = _mm_setzero_ps();
2003 fjz2 = _mm_setzero_ps();
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 r00 = _mm_mul_ps(rsq00,rinv00);
2010 r00 = _mm_andnot_ps(dummy_mask,r00);
2012 /* Calculate table index by multiplying r with table scale and truncate to integer */
2013 rt = _mm_mul_ps(r00,vftabscale);
2014 vfitab = _mm_cvttps_epi32(rt);
2015 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2016 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2018 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2019 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2020 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2021 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2022 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2023 _MM_TRANSPOSE4_PS(Y,F,G,H);
2024 Heps = _mm_mul_ps(vfeps,H);
2025 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2026 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2027 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
2029 /* CUBIC SPLINE TABLE DISPERSION */
2030 vfitab = _mm_add_epi32(vfitab,ifour);
2031 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2032 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2033 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2034 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2035 _MM_TRANSPOSE4_PS(Y,F,G,H);
2036 Heps = _mm_mul_ps(vfeps,H);
2037 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2038 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2039 fvdw6 = _mm_mul_ps(c6_00,FF);
2041 /* CUBIC SPLINE TABLE REPULSION */
2042 vfitab = _mm_add_epi32(vfitab,ifour);
2043 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2044 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2045 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2046 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2047 _MM_TRANSPOSE4_PS(Y,F,G,H);
2048 Heps = _mm_mul_ps(vfeps,H);
2049 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2050 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2051 fvdw12 = _mm_mul_ps(c12_00,FF);
2052 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2054 fscal = _mm_add_ps(felec,fvdw);
2056 fscal = _mm_andnot_ps(dummy_mask,fscal);
2058 /* Calculate temporary vectorial force */
2059 tx = _mm_mul_ps(fscal,dx00);
2060 ty = _mm_mul_ps(fscal,dy00);
2061 tz = _mm_mul_ps(fscal,dz00);
2063 /* Update vectorial force */
2064 fix0 = _mm_add_ps(fix0,tx);
2065 fiy0 = _mm_add_ps(fiy0,ty);
2066 fiz0 = _mm_add_ps(fiz0,tz);
2068 fjx0 = _mm_add_ps(fjx0,tx);
2069 fjy0 = _mm_add_ps(fjy0,ty);
2070 fjz0 = _mm_add_ps(fjz0,tz);
2072 /**************************
2073 * CALCULATE INTERACTIONS *
2074 **************************/
2076 r01 = _mm_mul_ps(rsq01,rinv01);
2077 r01 = _mm_andnot_ps(dummy_mask,r01);
2079 /* Calculate table index by multiplying r with table scale and truncate to integer */
2080 rt = _mm_mul_ps(r01,vftabscale);
2081 vfitab = _mm_cvttps_epi32(rt);
2082 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2083 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2085 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2086 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2087 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2088 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2089 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2090 _MM_TRANSPOSE4_PS(Y,F,G,H);
2091 Heps = _mm_mul_ps(vfeps,H);
2092 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2093 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2094 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2098 fscal = _mm_andnot_ps(dummy_mask,fscal);
2100 /* Calculate temporary vectorial force */
2101 tx = _mm_mul_ps(fscal,dx01);
2102 ty = _mm_mul_ps(fscal,dy01);
2103 tz = _mm_mul_ps(fscal,dz01);
2105 /* Update vectorial force */
2106 fix0 = _mm_add_ps(fix0,tx);
2107 fiy0 = _mm_add_ps(fiy0,ty);
2108 fiz0 = _mm_add_ps(fiz0,tz);
2110 fjx1 = _mm_add_ps(fjx1,tx);
2111 fjy1 = _mm_add_ps(fjy1,ty);
2112 fjz1 = _mm_add_ps(fjz1,tz);
2114 /**************************
2115 * CALCULATE INTERACTIONS *
2116 **************************/
2118 r02 = _mm_mul_ps(rsq02,rinv02);
2119 r02 = _mm_andnot_ps(dummy_mask,r02);
2121 /* Calculate table index by multiplying r with table scale and truncate to integer */
2122 rt = _mm_mul_ps(r02,vftabscale);
2123 vfitab = _mm_cvttps_epi32(rt);
2124 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2125 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2127 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2128 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2129 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2130 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2131 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2132 _MM_TRANSPOSE4_PS(Y,F,G,H);
2133 Heps = _mm_mul_ps(vfeps,H);
2134 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2135 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2136 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2140 fscal = _mm_andnot_ps(dummy_mask,fscal);
2142 /* Calculate temporary vectorial force */
2143 tx = _mm_mul_ps(fscal,dx02);
2144 ty = _mm_mul_ps(fscal,dy02);
2145 tz = _mm_mul_ps(fscal,dz02);
2147 /* Update vectorial force */
2148 fix0 = _mm_add_ps(fix0,tx);
2149 fiy0 = _mm_add_ps(fiy0,ty);
2150 fiz0 = _mm_add_ps(fiz0,tz);
2152 fjx2 = _mm_add_ps(fjx2,tx);
2153 fjy2 = _mm_add_ps(fjy2,ty);
2154 fjz2 = _mm_add_ps(fjz2,tz);
2156 /**************************
2157 * CALCULATE INTERACTIONS *
2158 **************************/
2160 r10 = _mm_mul_ps(rsq10,rinv10);
2161 r10 = _mm_andnot_ps(dummy_mask,r10);
2163 /* Calculate table index by multiplying r with table scale and truncate to integer */
2164 rt = _mm_mul_ps(r10,vftabscale);
2165 vfitab = _mm_cvttps_epi32(rt);
2166 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2167 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2169 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2171 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2172 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2173 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2174 _MM_TRANSPOSE4_PS(Y,F,G,H);
2175 Heps = _mm_mul_ps(vfeps,H);
2176 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2177 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2178 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2182 fscal = _mm_andnot_ps(dummy_mask,fscal);
2184 /* Calculate temporary vectorial force */
2185 tx = _mm_mul_ps(fscal,dx10);
2186 ty = _mm_mul_ps(fscal,dy10);
2187 tz = _mm_mul_ps(fscal,dz10);
2189 /* Update vectorial force */
2190 fix1 = _mm_add_ps(fix1,tx);
2191 fiy1 = _mm_add_ps(fiy1,ty);
2192 fiz1 = _mm_add_ps(fiz1,tz);
2194 fjx0 = _mm_add_ps(fjx0,tx);
2195 fjy0 = _mm_add_ps(fjy0,ty);
2196 fjz0 = _mm_add_ps(fjz0,tz);
2198 /**************************
2199 * CALCULATE INTERACTIONS *
2200 **************************/
2202 r11 = _mm_mul_ps(rsq11,rinv11);
2203 r11 = _mm_andnot_ps(dummy_mask,r11);
2205 /* Calculate table index by multiplying r with table scale and truncate to integer */
2206 rt = _mm_mul_ps(r11,vftabscale);
2207 vfitab = _mm_cvttps_epi32(rt);
2208 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2209 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2211 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2212 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2213 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2214 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2215 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2216 _MM_TRANSPOSE4_PS(Y,F,G,H);
2217 Heps = _mm_mul_ps(vfeps,H);
2218 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2219 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2220 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2224 fscal = _mm_andnot_ps(dummy_mask,fscal);
2226 /* Calculate temporary vectorial force */
2227 tx = _mm_mul_ps(fscal,dx11);
2228 ty = _mm_mul_ps(fscal,dy11);
2229 tz = _mm_mul_ps(fscal,dz11);
2231 /* Update vectorial force */
2232 fix1 = _mm_add_ps(fix1,tx);
2233 fiy1 = _mm_add_ps(fiy1,ty);
2234 fiz1 = _mm_add_ps(fiz1,tz);
2236 fjx1 = _mm_add_ps(fjx1,tx);
2237 fjy1 = _mm_add_ps(fjy1,ty);
2238 fjz1 = _mm_add_ps(fjz1,tz);
2240 /**************************
2241 * CALCULATE INTERACTIONS *
2242 **************************/
2244 r12 = _mm_mul_ps(rsq12,rinv12);
2245 r12 = _mm_andnot_ps(dummy_mask,r12);
2247 /* Calculate table index by multiplying r with table scale and truncate to integer */
2248 rt = _mm_mul_ps(r12,vftabscale);
2249 vfitab = _mm_cvttps_epi32(rt);
2250 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2251 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2253 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2254 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2255 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2256 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2257 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2258 _MM_TRANSPOSE4_PS(Y,F,G,H);
2259 Heps = _mm_mul_ps(vfeps,H);
2260 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2261 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2262 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2266 fscal = _mm_andnot_ps(dummy_mask,fscal);
2268 /* Calculate temporary vectorial force */
2269 tx = _mm_mul_ps(fscal,dx12);
2270 ty = _mm_mul_ps(fscal,dy12);
2271 tz = _mm_mul_ps(fscal,dz12);
2273 /* Update vectorial force */
2274 fix1 = _mm_add_ps(fix1,tx);
2275 fiy1 = _mm_add_ps(fiy1,ty);
2276 fiz1 = _mm_add_ps(fiz1,tz);
2278 fjx2 = _mm_add_ps(fjx2,tx);
2279 fjy2 = _mm_add_ps(fjy2,ty);
2280 fjz2 = _mm_add_ps(fjz2,tz);
2282 /**************************
2283 * CALCULATE INTERACTIONS *
2284 **************************/
2286 r20 = _mm_mul_ps(rsq20,rinv20);
2287 r20 = _mm_andnot_ps(dummy_mask,r20);
2289 /* Calculate table index by multiplying r with table scale and truncate to integer */
2290 rt = _mm_mul_ps(r20,vftabscale);
2291 vfitab = _mm_cvttps_epi32(rt);
2292 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2293 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2295 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2296 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2297 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2298 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2299 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2300 _MM_TRANSPOSE4_PS(Y,F,G,H);
2301 Heps = _mm_mul_ps(vfeps,H);
2302 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2303 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2304 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2308 fscal = _mm_andnot_ps(dummy_mask,fscal);
2310 /* Calculate temporary vectorial force */
2311 tx = _mm_mul_ps(fscal,dx20);
2312 ty = _mm_mul_ps(fscal,dy20);
2313 tz = _mm_mul_ps(fscal,dz20);
2315 /* Update vectorial force */
2316 fix2 = _mm_add_ps(fix2,tx);
2317 fiy2 = _mm_add_ps(fiy2,ty);
2318 fiz2 = _mm_add_ps(fiz2,tz);
2320 fjx0 = _mm_add_ps(fjx0,tx);
2321 fjy0 = _mm_add_ps(fjy0,ty);
2322 fjz0 = _mm_add_ps(fjz0,tz);
2324 /**************************
2325 * CALCULATE INTERACTIONS *
2326 **************************/
2328 r21 = _mm_mul_ps(rsq21,rinv21);
2329 r21 = _mm_andnot_ps(dummy_mask,r21);
2331 /* Calculate table index by multiplying r with table scale and truncate to integer */
2332 rt = _mm_mul_ps(r21,vftabscale);
2333 vfitab = _mm_cvttps_epi32(rt);
2334 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2335 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2337 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2338 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2339 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2340 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2341 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2342 _MM_TRANSPOSE4_PS(Y,F,G,H);
2343 Heps = _mm_mul_ps(vfeps,H);
2344 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2345 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2346 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2350 fscal = _mm_andnot_ps(dummy_mask,fscal);
2352 /* Calculate temporary vectorial force */
2353 tx = _mm_mul_ps(fscal,dx21);
2354 ty = _mm_mul_ps(fscal,dy21);
2355 tz = _mm_mul_ps(fscal,dz21);
2357 /* Update vectorial force */
2358 fix2 = _mm_add_ps(fix2,tx);
2359 fiy2 = _mm_add_ps(fiy2,ty);
2360 fiz2 = _mm_add_ps(fiz2,tz);
2362 fjx1 = _mm_add_ps(fjx1,tx);
2363 fjy1 = _mm_add_ps(fjy1,ty);
2364 fjz1 = _mm_add_ps(fjz1,tz);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 r22 = _mm_mul_ps(rsq22,rinv22);
2371 r22 = _mm_andnot_ps(dummy_mask,r22);
2373 /* Calculate table index by multiplying r with table scale and truncate to integer */
2374 rt = _mm_mul_ps(r22,vftabscale);
2375 vfitab = _mm_cvttps_epi32(rt);
2376 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2377 vfitab = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2379 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2380 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2381 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2382 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2383 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2384 _MM_TRANSPOSE4_PS(Y,F,G,H);
2385 Heps = _mm_mul_ps(vfeps,H);
2386 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2387 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2388 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2392 fscal = _mm_andnot_ps(dummy_mask,fscal);
2394 /* Calculate temporary vectorial force */
2395 tx = _mm_mul_ps(fscal,dx22);
2396 ty = _mm_mul_ps(fscal,dy22);
2397 tz = _mm_mul_ps(fscal,dz22);
2399 /* Update vectorial force */
2400 fix2 = _mm_add_ps(fix2,tx);
2401 fiy2 = _mm_add_ps(fiy2,ty);
2402 fiz2 = _mm_add_ps(fiz2,tz);
2404 fjx2 = _mm_add_ps(fjx2,tx);
2405 fjy2 = _mm_add_ps(fjy2,ty);
2406 fjz2 = _mm_add_ps(fjz2,tz);
2408 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2409 f+j_coord_offsetC,f+j_coord_offsetD,
2410 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2412 /* Inner loop uses 382 flops */
2415 /* End of innermost loop */
2417 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2418 f+i_coord_offset,fshift+i_shift_offset);
2420 /* Increment number of inner iterations */
2421 inneriter += j_index_end - j_index_start;
2423 /* Outer loop uses 27 flops */
2426 /* Increment number of outer iterations */
2429 /* Update outer/inner flops */
2431 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*27 + inneriter*382);