2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of GROMACS.
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
93 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
97 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
99 __m128i ifour = _mm_set1_epi32(4);
100 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
102 __m128 dummy_mask,cutoff_mask;
103 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104 __m128 one = _mm_set1_ps(1.0);
105 __m128 two = _mm_set1_ps(2.0);
111 jindex = nlist->jindex;
113 shiftidx = nlist->shift;
115 shiftvec = fr->shift_vec[0];
116 fshift = fr->fshift[0];
117 facel = _mm_set1_ps(fr->epsfac);
118 charge = mdatoms->chargeA;
119 nvdwtype = fr->ntype;
121 vdwtype = mdatoms->typeA;
123 vftab = kernel_data->table_elec->data;
124 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
129 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
130 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
131 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
133 jq0 = _mm_set1_ps(charge[inr+0]);
134 jq1 = _mm_set1_ps(charge[inr+1]);
135 jq2 = _mm_set1_ps(charge[inr+2]);
136 vdwjidx0A = 2*vdwtype[inr+0];
137 qq00 = _mm_mul_ps(iq0,jq0);
138 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
139 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
140 qq01 = _mm_mul_ps(iq0,jq1);
141 qq02 = _mm_mul_ps(iq0,jq2);
142 qq10 = _mm_mul_ps(iq1,jq0);
143 qq11 = _mm_mul_ps(iq1,jq1);
144 qq12 = _mm_mul_ps(iq1,jq2);
145 qq20 = _mm_mul_ps(iq2,jq0);
146 qq21 = _mm_mul_ps(iq2,jq1);
147 qq22 = _mm_mul_ps(iq2,jq2);
149 /* Avoid stupid compiler warnings */
150 jnrA = jnrB = jnrC = jnrD = 0;
159 for(iidx=0;iidx<4*DIM;iidx++)
164 /* Start outer loop over neighborlists */
165 for(iidx=0; iidx<nri; iidx++)
167 /* Load shift vector for this list */
168 i_shift_offset = DIM*shiftidx[iidx];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
180 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
182 fix0 = _mm_setzero_ps();
183 fiy0 = _mm_setzero_ps();
184 fiz0 = _mm_setzero_ps();
185 fix1 = _mm_setzero_ps();
186 fiy1 = _mm_setzero_ps();
187 fiz1 = _mm_setzero_ps();
188 fix2 = _mm_setzero_ps();
189 fiy2 = _mm_setzero_ps();
190 fiz2 = _mm_setzero_ps();
192 /* Reset potential sums */
193 velecsum = _mm_setzero_ps();
194 vvdwsum = _mm_setzero_ps();
196 /* Start inner kernel loop */
197 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200 /* Get j neighbor index, and coordinate index */
205 j_coord_offsetA = DIM*jnrA;
206 j_coord_offsetB = DIM*jnrB;
207 j_coord_offsetC = DIM*jnrC;
208 j_coord_offsetD = DIM*jnrD;
210 /* load j atom coordinates */
211 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212 x+j_coord_offsetC,x+j_coord_offsetD,
213 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215 /* Calculate displacement vector */
216 dx00 = _mm_sub_ps(ix0,jx0);
217 dy00 = _mm_sub_ps(iy0,jy0);
218 dz00 = _mm_sub_ps(iz0,jz0);
219 dx01 = _mm_sub_ps(ix0,jx1);
220 dy01 = _mm_sub_ps(iy0,jy1);
221 dz01 = _mm_sub_ps(iz0,jz1);
222 dx02 = _mm_sub_ps(ix0,jx2);
223 dy02 = _mm_sub_ps(iy0,jy2);
224 dz02 = _mm_sub_ps(iz0,jz2);
225 dx10 = _mm_sub_ps(ix1,jx0);
226 dy10 = _mm_sub_ps(iy1,jy0);
227 dz10 = _mm_sub_ps(iz1,jz0);
228 dx11 = _mm_sub_ps(ix1,jx1);
229 dy11 = _mm_sub_ps(iy1,jy1);
230 dz11 = _mm_sub_ps(iz1,jz1);
231 dx12 = _mm_sub_ps(ix1,jx2);
232 dy12 = _mm_sub_ps(iy1,jy2);
233 dz12 = _mm_sub_ps(iz1,jz2);
234 dx20 = _mm_sub_ps(ix2,jx0);
235 dy20 = _mm_sub_ps(iy2,jy0);
236 dz20 = _mm_sub_ps(iz2,jz0);
237 dx21 = _mm_sub_ps(ix2,jx1);
238 dy21 = _mm_sub_ps(iy2,jy1);
239 dz21 = _mm_sub_ps(iz2,jz1);
240 dx22 = _mm_sub_ps(ix2,jx2);
241 dy22 = _mm_sub_ps(iy2,jy2);
242 dz22 = _mm_sub_ps(iz2,jz2);
244 /* Calculate squared distance and things based on it */
245 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
246 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
247 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
248 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
249 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
250 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
251 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
252 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
253 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
255 rinv00 = gmx_mm_invsqrt_ps(rsq00);
256 rinv01 = gmx_mm_invsqrt_ps(rsq01);
257 rinv02 = gmx_mm_invsqrt_ps(rsq02);
258 rinv10 = gmx_mm_invsqrt_ps(rsq10);
259 rinv11 = gmx_mm_invsqrt_ps(rsq11);
260 rinv12 = gmx_mm_invsqrt_ps(rsq12);
261 rinv20 = gmx_mm_invsqrt_ps(rsq20);
262 rinv21 = gmx_mm_invsqrt_ps(rsq21);
263 rinv22 = gmx_mm_invsqrt_ps(rsq22);
265 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
267 fjx0 = _mm_setzero_ps();
268 fjy0 = _mm_setzero_ps();
269 fjz0 = _mm_setzero_ps();
270 fjx1 = _mm_setzero_ps();
271 fjy1 = _mm_setzero_ps();
272 fjz1 = _mm_setzero_ps();
273 fjx2 = _mm_setzero_ps();
274 fjy2 = _mm_setzero_ps();
275 fjz2 = _mm_setzero_ps();
277 /**************************
278 * CALCULATE INTERACTIONS *
279 **************************/
281 r00 = _mm_mul_ps(rsq00,rinv00);
283 /* Calculate table index by multiplying r with table scale and truncate to integer */
284 rt = _mm_mul_ps(r00,vftabscale);
285 vfitab = _mm_cvttps_epi32(rt);
286 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
287 vfitab = _mm_slli_epi32(vfitab,2);
289 /* CUBIC SPLINE TABLE ELECTROSTATICS */
290 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
291 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
292 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
293 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
294 _MM_TRANSPOSE4_PS(Y,F,G,H);
295 Heps = _mm_mul_ps(vfeps,H);
296 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
297 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
298 velec = _mm_mul_ps(qq00,VV);
299 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
300 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
302 /* LENNARD-JONES DISPERSION/REPULSION */
304 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
305 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
306 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
307 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
308 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
310 /* Update potential sum for this i atom from the interaction with this j atom. */
311 velecsum = _mm_add_ps(velecsum,velec);
312 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
314 fscal = _mm_add_ps(felec,fvdw);
316 /* Calculate temporary vectorial force */
317 tx = _mm_mul_ps(fscal,dx00);
318 ty = _mm_mul_ps(fscal,dy00);
319 tz = _mm_mul_ps(fscal,dz00);
321 /* Update vectorial force */
322 fix0 = _mm_add_ps(fix0,tx);
323 fiy0 = _mm_add_ps(fiy0,ty);
324 fiz0 = _mm_add_ps(fiz0,tz);
326 fjx0 = _mm_add_ps(fjx0,tx);
327 fjy0 = _mm_add_ps(fjy0,ty);
328 fjz0 = _mm_add_ps(fjz0,tz);
330 /**************************
331 * CALCULATE INTERACTIONS *
332 **************************/
334 r01 = _mm_mul_ps(rsq01,rinv01);
336 /* Calculate table index by multiplying r with table scale and truncate to integer */
337 rt = _mm_mul_ps(r01,vftabscale);
338 vfitab = _mm_cvttps_epi32(rt);
339 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
340 vfitab = _mm_slli_epi32(vfitab,2);
342 /* CUBIC SPLINE TABLE ELECTROSTATICS */
343 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
344 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
345 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
346 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
347 _MM_TRANSPOSE4_PS(Y,F,G,H);
348 Heps = _mm_mul_ps(vfeps,H);
349 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
350 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
351 velec = _mm_mul_ps(qq01,VV);
352 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
353 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
355 /* Update potential sum for this i atom from the interaction with this j atom. */
356 velecsum = _mm_add_ps(velecsum,velec);
360 /* Calculate temporary vectorial force */
361 tx = _mm_mul_ps(fscal,dx01);
362 ty = _mm_mul_ps(fscal,dy01);
363 tz = _mm_mul_ps(fscal,dz01);
365 /* Update vectorial force */
366 fix0 = _mm_add_ps(fix0,tx);
367 fiy0 = _mm_add_ps(fiy0,ty);
368 fiz0 = _mm_add_ps(fiz0,tz);
370 fjx1 = _mm_add_ps(fjx1,tx);
371 fjy1 = _mm_add_ps(fjy1,ty);
372 fjz1 = _mm_add_ps(fjz1,tz);
374 /**************************
375 * CALCULATE INTERACTIONS *
376 **************************/
378 r02 = _mm_mul_ps(rsq02,rinv02);
380 /* Calculate table index by multiplying r with table scale and truncate to integer */
381 rt = _mm_mul_ps(r02,vftabscale);
382 vfitab = _mm_cvttps_epi32(rt);
383 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
384 vfitab = _mm_slli_epi32(vfitab,2);
386 /* CUBIC SPLINE TABLE ELECTROSTATICS */
387 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
388 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
389 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
390 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
391 _MM_TRANSPOSE4_PS(Y,F,G,H);
392 Heps = _mm_mul_ps(vfeps,H);
393 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
394 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
395 velec = _mm_mul_ps(qq02,VV);
396 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
397 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
399 /* Update potential sum for this i atom from the interaction with this j atom. */
400 velecsum = _mm_add_ps(velecsum,velec);
404 /* Calculate temporary vectorial force */
405 tx = _mm_mul_ps(fscal,dx02);
406 ty = _mm_mul_ps(fscal,dy02);
407 tz = _mm_mul_ps(fscal,dz02);
409 /* Update vectorial force */
410 fix0 = _mm_add_ps(fix0,tx);
411 fiy0 = _mm_add_ps(fiy0,ty);
412 fiz0 = _mm_add_ps(fiz0,tz);
414 fjx2 = _mm_add_ps(fjx2,tx);
415 fjy2 = _mm_add_ps(fjy2,ty);
416 fjz2 = _mm_add_ps(fjz2,tz);
418 /**************************
419 * CALCULATE INTERACTIONS *
420 **************************/
422 r10 = _mm_mul_ps(rsq10,rinv10);
424 /* Calculate table index by multiplying r with table scale and truncate to integer */
425 rt = _mm_mul_ps(r10,vftabscale);
426 vfitab = _mm_cvttps_epi32(rt);
427 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
428 vfitab = _mm_slli_epi32(vfitab,2);
430 /* CUBIC SPLINE TABLE ELECTROSTATICS */
431 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
432 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
433 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
434 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
435 _MM_TRANSPOSE4_PS(Y,F,G,H);
436 Heps = _mm_mul_ps(vfeps,H);
437 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
438 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
439 velec = _mm_mul_ps(qq10,VV);
440 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
441 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velecsum = _mm_add_ps(velecsum,velec);
448 /* Calculate temporary vectorial force */
449 tx = _mm_mul_ps(fscal,dx10);
450 ty = _mm_mul_ps(fscal,dy10);
451 tz = _mm_mul_ps(fscal,dz10);
453 /* Update vectorial force */
454 fix1 = _mm_add_ps(fix1,tx);
455 fiy1 = _mm_add_ps(fiy1,ty);
456 fiz1 = _mm_add_ps(fiz1,tz);
458 fjx0 = _mm_add_ps(fjx0,tx);
459 fjy0 = _mm_add_ps(fjy0,ty);
460 fjz0 = _mm_add_ps(fjz0,tz);
462 /**************************
463 * CALCULATE INTERACTIONS *
464 **************************/
466 r11 = _mm_mul_ps(rsq11,rinv11);
468 /* Calculate table index by multiplying r with table scale and truncate to integer */
469 rt = _mm_mul_ps(r11,vftabscale);
470 vfitab = _mm_cvttps_epi32(rt);
471 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
472 vfitab = _mm_slli_epi32(vfitab,2);
474 /* CUBIC SPLINE TABLE ELECTROSTATICS */
475 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
476 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
477 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
478 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
479 _MM_TRANSPOSE4_PS(Y,F,G,H);
480 Heps = _mm_mul_ps(vfeps,H);
481 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
482 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
483 velec = _mm_mul_ps(qq11,VV);
484 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
485 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velecsum = _mm_add_ps(velecsum,velec);
492 /* Calculate temporary vectorial force */
493 tx = _mm_mul_ps(fscal,dx11);
494 ty = _mm_mul_ps(fscal,dy11);
495 tz = _mm_mul_ps(fscal,dz11);
497 /* Update vectorial force */
498 fix1 = _mm_add_ps(fix1,tx);
499 fiy1 = _mm_add_ps(fiy1,ty);
500 fiz1 = _mm_add_ps(fiz1,tz);
502 fjx1 = _mm_add_ps(fjx1,tx);
503 fjy1 = _mm_add_ps(fjy1,ty);
504 fjz1 = _mm_add_ps(fjz1,tz);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 r12 = _mm_mul_ps(rsq12,rinv12);
512 /* Calculate table index by multiplying r with table scale and truncate to integer */
513 rt = _mm_mul_ps(r12,vftabscale);
514 vfitab = _mm_cvttps_epi32(rt);
515 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
516 vfitab = _mm_slli_epi32(vfitab,2);
518 /* CUBIC SPLINE TABLE ELECTROSTATICS */
519 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
520 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
521 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
522 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
523 _MM_TRANSPOSE4_PS(Y,F,G,H);
524 Heps = _mm_mul_ps(vfeps,H);
525 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
526 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
527 velec = _mm_mul_ps(qq12,VV);
528 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
529 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
531 /* Update potential sum for this i atom from the interaction with this j atom. */
532 velecsum = _mm_add_ps(velecsum,velec);
536 /* Calculate temporary vectorial force */
537 tx = _mm_mul_ps(fscal,dx12);
538 ty = _mm_mul_ps(fscal,dy12);
539 tz = _mm_mul_ps(fscal,dz12);
541 /* Update vectorial force */
542 fix1 = _mm_add_ps(fix1,tx);
543 fiy1 = _mm_add_ps(fiy1,ty);
544 fiz1 = _mm_add_ps(fiz1,tz);
546 fjx2 = _mm_add_ps(fjx2,tx);
547 fjy2 = _mm_add_ps(fjy2,ty);
548 fjz2 = _mm_add_ps(fjz2,tz);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 r20 = _mm_mul_ps(rsq20,rinv20);
556 /* Calculate table index by multiplying r with table scale and truncate to integer */
557 rt = _mm_mul_ps(r20,vftabscale);
558 vfitab = _mm_cvttps_epi32(rt);
559 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
560 vfitab = _mm_slli_epi32(vfitab,2);
562 /* CUBIC SPLINE TABLE ELECTROSTATICS */
563 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
564 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
565 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
566 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
567 _MM_TRANSPOSE4_PS(Y,F,G,H);
568 Heps = _mm_mul_ps(vfeps,H);
569 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
570 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
571 velec = _mm_mul_ps(qq20,VV);
572 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
573 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velecsum = _mm_add_ps(velecsum,velec);
580 /* Calculate temporary vectorial force */
581 tx = _mm_mul_ps(fscal,dx20);
582 ty = _mm_mul_ps(fscal,dy20);
583 tz = _mm_mul_ps(fscal,dz20);
585 /* Update vectorial force */
586 fix2 = _mm_add_ps(fix2,tx);
587 fiy2 = _mm_add_ps(fiy2,ty);
588 fiz2 = _mm_add_ps(fiz2,tz);
590 fjx0 = _mm_add_ps(fjx0,tx);
591 fjy0 = _mm_add_ps(fjy0,ty);
592 fjz0 = _mm_add_ps(fjz0,tz);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 r21 = _mm_mul_ps(rsq21,rinv21);
600 /* Calculate table index by multiplying r with table scale and truncate to integer */
601 rt = _mm_mul_ps(r21,vftabscale);
602 vfitab = _mm_cvttps_epi32(rt);
603 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
604 vfitab = _mm_slli_epi32(vfitab,2);
606 /* CUBIC SPLINE TABLE ELECTROSTATICS */
607 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
608 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
609 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
610 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
611 _MM_TRANSPOSE4_PS(Y,F,G,H);
612 Heps = _mm_mul_ps(vfeps,H);
613 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
614 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
615 velec = _mm_mul_ps(qq21,VV);
616 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
617 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
619 /* Update potential sum for this i atom from the interaction with this j atom. */
620 velecsum = _mm_add_ps(velecsum,velec);
624 /* Calculate temporary vectorial force */
625 tx = _mm_mul_ps(fscal,dx21);
626 ty = _mm_mul_ps(fscal,dy21);
627 tz = _mm_mul_ps(fscal,dz21);
629 /* Update vectorial force */
630 fix2 = _mm_add_ps(fix2,tx);
631 fiy2 = _mm_add_ps(fiy2,ty);
632 fiz2 = _mm_add_ps(fiz2,tz);
634 fjx1 = _mm_add_ps(fjx1,tx);
635 fjy1 = _mm_add_ps(fjy1,ty);
636 fjz1 = _mm_add_ps(fjz1,tz);
638 /**************************
639 * CALCULATE INTERACTIONS *
640 **************************/
642 r22 = _mm_mul_ps(rsq22,rinv22);
644 /* Calculate table index by multiplying r with table scale and truncate to integer */
645 rt = _mm_mul_ps(r22,vftabscale);
646 vfitab = _mm_cvttps_epi32(rt);
647 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
648 vfitab = _mm_slli_epi32(vfitab,2);
650 /* CUBIC SPLINE TABLE ELECTROSTATICS */
651 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
652 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
653 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
654 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
655 _MM_TRANSPOSE4_PS(Y,F,G,H);
656 Heps = _mm_mul_ps(vfeps,H);
657 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
658 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
659 velec = _mm_mul_ps(qq22,VV);
660 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
661 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velecsum = _mm_add_ps(velecsum,velec);
668 /* Calculate temporary vectorial force */
669 tx = _mm_mul_ps(fscal,dx22);
670 ty = _mm_mul_ps(fscal,dy22);
671 tz = _mm_mul_ps(fscal,dz22);
673 /* Update vectorial force */
674 fix2 = _mm_add_ps(fix2,tx);
675 fiy2 = _mm_add_ps(fiy2,ty);
676 fiz2 = _mm_add_ps(fiz2,tz);
678 fjx2 = _mm_add_ps(fjx2,tx);
679 fjy2 = _mm_add_ps(fjy2,ty);
680 fjz2 = _mm_add_ps(fjz2,tz);
682 fjptrA = f+j_coord_offsetA;
683 fjptrB = f+j_coord_offsetB;
684 fjptrC = f+j_coord_offsetC;
685 fjptrD = f+j_coord_offsetD;
687 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
688 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
690 /* Inner loop uses 400 flops */
696 /* Get j neighbor index, and coordinate index */
697 jnrlistA = jjnr[jidx];
698 jnrlistB = jjnr[jidx+1];
699 jnrlistC = jjnr[jidx+2];
700 jnrlistD = jjnr[jidx+3];
701 /* Sign of each element will be negative for non-real atoms.
702 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
703 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
705 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
706 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
707 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
708 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
709 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
710 j_coord_offsetA = DIM*jnrA;
711 j_coord_offsetB = DIM*jnrB;
712 j_coord_offsetC = DIM*jnrC;
713 j_coord_offsetD = DIM*jnrD;
715 /* load j atom coordinates */
716 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
717 x+j_coord_offsetC,x+j_coord_offsetD,
718 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
720 /* Calculate displacement vector */
721 dx00 = _mm_sub_ps(ix0,jx0);
722 dy00 = _mm_sub_ps(iy0,jy0);
723 dz00 = _mm_sub_ps(iz0,jz0);
724 dx01 = _mm_sub_ps(ix0,jx1);
725 dy01 = _mm_sub_ps(iy0,jy1);
726 dz01 = _mm_sub_ps(iz0,jz1);
727 dx02 = _mm_sub_ps(ix0,jx2);
728 dy02 = _mm_sub_ps(iy0,jy2);
729 dz02 = _mm_sub_ps(iz0,jz2);
730 dx10 = _mm_sub_ps(ix1,jx0);
731 dy10 = _mm_sub_ps(iy1,jy0);
732 dz10 = _mm_sub_ps(iz1,jz0);
733 dx11 = _mm_sub_ps(ix1,jx1);
734 dy11 = _mm_sub_ps(iy1,jy1);
735 dz11 = _mm_sub_ps(iz1,jz1);
736 dx12 = _mm_sub_ps(ix1,jx2);
737 dy12 = _mm_sub_ps(iy1,jy2);
738 dz12 = _mm_sub_ps(iz1,jz2);
739 dx20 = _mm_sub_ps(ix2,jx0);
740 dy20 = _mm_sub_ps(iy2,jy0);
741 dz20 = _mm_sub_ps(iz2,jz0);
742 dx21 = _mm_sub_ps(ix2,jx1);
743 dy21 = _mm_sub_ps(iy2,jy1);
744 dz21 = _mm_sub_ps(iz2,jz1);
745 dx22 = _mm_sub_ps(ix2,jx2);
746 dy22 = _mm_sub_ps(iy2,jy2);
747 dz22 = _mm_sub_ps(iz2,jz2);
749 /* Calculate squared distance and things based on it */
750 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
751 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
752 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
753 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
754 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
755 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
756 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
757 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
758 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
760 rinv00 = gmx_mm_invsqrt_ps(rsq00);
761 rinv01 = gmx_mm_invsqrt_ps(rsq01);
762 rinv02 = gmx_mm_invsqrt_ps(rsq02);
763 rinv10 = gmx_mm_invsqrt_ps(rsq10);
764 rinv11 = gmx_mm_invsqrt_ps(rsq11);
765 rinv12 = gmx_mm_invsqrt_ps(rsq12);
766 rinv20 = gmx_mm_invsqrt_ps(rsq20);
767 rinv21 = gmx_mm_invsqrt_ps(rsq21);
768 rinv22 = gmx_mm_invsqrt_ps(rsq22);
770 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
772 fjx0 = _mm_setzero_ps();
773 fjy0 = _mm_setzero_ps();
774 fjz0 = _mm_setzero_ps();
775 fjx1 = _mm_setzero_ps();
776 fjy1 = _mm_setzero_ps();
777 fjz1 = _mm_setzero_ps();
778 fjx2 = _mm_setzero_ps();
779 fjy2 = _mm_setzero_ps();
780 fjz2 = _mm_setzero_ps();
782 /**************************
783 * CALCULATE INTERACTIONS *
784 **************************/
786 r00 = _mm_mul_ps(rsq00,rinv00);
787 r00 = _mm_andnot_ps(dummy_mask,r00);
789 /* Calculate table index by multiplying r with table scale and truncate to integer */
790 rt = _mm_mul_ps(r00,vftabscale);
791 vfitab = _mm_cvttps_epi32(rt);
792 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
793 vfitab = _mm_slli_epi32(vfitab,2);
795 /* CUBIC SPLINE TABLE ELECTROSTATICS */
796 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
797 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
798 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
799 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
800 _MM_TRANSPOSE4_PS(Y,F,G,H);
801 Heps = _mm_mul_ps(vfeps,H);
802 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
803 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
804 velec = _mm_mul_ps(qq00,VV);
805 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
806 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
808 /* LENNARD-JONES DISPERSION/REPULSION */
810 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
811 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
812 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
813 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
814 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
816 /* Update potential sum for this i atom from the interaction with this j atom. */
817 velec = _mm_andnot_ps(dummy_mask,velec);
818 velecsum = _mm_add_ps(velecsum,velec);
819 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
820 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
822 fscal = _mm_add_ps(felec,fvdw);
824 fscal = _mm_andnot_ps(dummy_mask,fscal);
826 /* Calculate temporary vectorial force */
827 tx = _mm_mul_ps(fscal,dx00);
828 ty = _mm_mul_ps(fscal,dy00);
829 tz = _mm_mul_ps(fscal,dz00);
831 /* Update vectorial force */
832 fix0 = _mm_add_ps(fix0,tx);
833 fiy0 = _mm_add_ps(fiy0,ty);
834 fiz0 = _mm_add_ps(fiz0,tz);
836 fjx0 = _mm_add_ps(fjx0,tx);
837 fjy0 = _mm_add_ps(fjy0,ty);
838 fjz0 = _mm_add_ps(fjz0,tz);
840 /**************************
841 * CALCULATE INTERACTIONS *
842 **************************/
844 r01 = _mm_mul_ps(rsq01,rinv01);
845 r01 = _mm_andnot_ps(dummy_mask,r01);
847 /* Calculate table index by multiplying r with table scale and truncate to integer */
848 rt = _mm_mul_ps(r01,vftabscale);
849 vfitab = _mm_cvttps_epi32(rt);
850 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
851 vfitab = _mm_slli_epi32(vfitab,2);
853 /* CUBIC SPLINE TABLE ELECTROSTATICS */
854 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
855 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
856 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
857 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
858 _MM_TRANSPOSE4_PS(Y,F,G,H);
859 Heps = _mm_mul_ps(vfeps,H);
860 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
861 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
862 velec = _mm_mul_ps(qq01,VV);
863 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
864 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
866 /* Update potential sum for this i atom from the interaction with this j atom. */
867 velec = _mm_andnot_ps(dummy_mask,velec);
868 velecsum = _mm_add_ps(velecsum,velec);
872 fscal = _mm_andnot_ps(dummy_mask,fscal);
874 /* Calculate temporary vectorial force */
875 tx = _mm_mul_ps(fscal,dx01);
876 ty = _mm_mul_ps(fscal,dy01);
877 tz = _mm_mul_ps(fscal,dz01);
879 /* Update vectorial force */
880 fix0 = _mm_add_ps(fix0,tx);
881 fiy0 = _mm_add_ps(fiy0,ty);
882 fiz0 = _mm_add_ps(fiz0,tz);
884 fjx1 = _mm_add_ps(fjx1,tx);
885 fjy1 = _mm_add_ps(fjy1,ty);
886 fjz1 = _mm_add_ps(fjz1,tz);
888 /**************************
889 * CALCULATE INTERACTIONS *
890 **************************/
892 r02 = _mm_mul_ps(rsq02,rinv02);
893 r02 = _mm_andnot_ps(dummy_mask,r02);
895 /* Calculate table index by multiplying r with table scale and truncate to integer */
896 rt = _mm_mul_ps(r02,vftabscale);
897 vfitab = _mm_cvttps_epi32(rt);
898 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
899 vfitab = _mm_slli_epi32(vfitab,2);
901 /* CUBIC SPLINE TABLE ELECTROSTATICS */
902 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
903 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
904 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
905 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
906 _MM_TRANSPOSE4_PS(Y,F,G,H);
907 Heps = _mm_mul_ps(vfeps,H);
908 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
909 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
910 velec = _mm_mul_ps(qq02,VV);
911 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
912 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
914 /* Update potential sum for this i atom from the interaction with this j atom. */
915 velec = _mm_andnot_ps(dummy_mask,velec);
916 velecsum = _mm_add_ps(velecsum,velec);
920 fscal = _mm_andnot_ps(dummy_mask,fscal);
922 /* Calculate temporary vectorial force */
923 tx = _mm_mul_ps(fscal,dx02);
924 ty = _mm_mul_ps(fscal,dy02);
925 tz = _mm_mul_ps(fscal,dz02);
927 /* Update vectorial force */
928 fix0 = _mm_add_ps(fix0,tx);
929 fiy0 = _mm_add_ps(fiy0,ty);
930 fiz0 = _mm_add_ps(fiz0,tz);
932 fjx2 = _mm_add_ps(fjx2,tx);
933 fjy2 = _mm_add_ps(fjy2,ty);
934 fjz2 = _mm_add_ps(fjz2,tz);
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 r10 = _mm_mul_ps(rsq10,rinv10);
941 r10 = _mm_andnot_ps(dummy_mask,r10);
943 /* Calculate table index by multiplying r with table scale and truncate to integer */
944 rt = _mm_mul_ps(r10,vftabscale);
945 vfitab = _mm_cvttps_epi32(rt);
946 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
947 vfitab = _mm_slli_epi32(vfitab,2);
949 /* CUBIC SPLINE TABLE ELECTROSTATICS */
950 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
951 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
952 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
953 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
954 _MM_TRANSPOSE4_PS(Y,F,G,H);
955 Heps = _mm_mul_ps(vfeps,H);
956 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
957 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
958 velec = _mm_mul_ps(qq10,VV);
959 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
960 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
962 /* Update potential sum for this i atom from the interaction with this j atom. */
963 velec = _mm_andnot_ps(dummy_mask,velec);
964 velecsum = _mm_add_ps(velecsum,velec);
968 fscal = _mm_andnot_ps(dummy_mask,fscal);
970 /* Calculate temporary vectorial force */
971 tx = _mm_mul_ps(fscal,dx10);
972 ty = _mm_mul_ps(fscal,dy10);
973 tz = _mm_mul_ps(fscal,dz10);
975 /* Update vectorial force */
976 fix1 = _mm_add_ps(fix1,tx);
977 fiy1 = _mm_add_ps(fiy1,ty);
978 fiz1 = _mm_add_ps(fiz1,tz);
980 fjx0 = _mm_add_ps(fjx0,tx);
981 fjy0 = _mm_add_ps(fjy0,ty);
982 fjz0 = _mm_add_ps(fjz0,tz);
984 /**************************
985 * CALCULATE INTERACTIONS *
986 **************************/
988 r11 = _mm_mul_ps(rsq11,rinv11);
989 r11 = _mm_andnot_ps(dummy_mask,r11);
991 /* Calculate table index by multiplying r with table scale and truncate to integer */
992 rt = _mm_mul_ps(r11,vftabscale);
993 vfitab = _mm_cvttps_epi32(rt);
994 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
995 vfitab = _mm_slli_epi32(vfitab,2);
997 /* CUBIC SPLINE TABLE ELECTROSTATICS */
998 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
999 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1000 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1001 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1002 _MM_TRANSPOSE4_PS(Y,F,G,H);
1003 Heps = _mm_mul_ps(vfeps,H);
1004 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1005 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1006 velec = _mm_mul_ps(qq11,VV);
1007 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1008 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1010 /* Update potential sum for this i atom from the interaction with this j atom. */
1011 velec = _mm_andnot_ps(dummy_mask,velec);
1012 velecsum = _mm_add_ps(velecsum,velec);
1016 fscal = _mm_andnot_ps(dummy_mask,fscal);
1018 /* Calculate temporary vectorial force */
1019 tx = _mm_mul_ps(fscal,dx11);
1020 ty = _mm_mul_ps(fscal,dy11);
1021 tz = _mm_mul_ps(fscal,dz11);
1023 /* Update vectorial force */
1024 fix1 = _mm_add_ps(fix1,tx);
1025 fiy1 = _mm_add_ps(fiy1,ty);
1026 fiz1 = _mm_add_ps(fiz1,tz);
1028 fjx1 = _mm_add_ps(fjx1,tx);
1029 fjy1 = _mm_add_ps(fjy1,ty);
1030 fjz1 = _mm_add_ps(fjz1,tz);
1032 /**************************
1033 * CALCULATE INTERACTIONS *
1034 **************************/
1036 r12 = _mm_mul_ps(rsq12,rinv12);
1037 r12 = _mm_andnot_ps(dummy_mask,r12);
1039 /* Calculate table index by multiplying r with table scale and truncate to integer */
1040 rt = _mm_mul_ps(r12,vftabscale);
1041 vfitab = _mm_cvttps_epi32(rt);
1042 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1043 vfitab = _mm_slli_epi32(vfitab,2);
1045 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1046 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1047 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1048 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1049 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1050 _MM_TRANSPOSE4_PS(Y,F,G,H);
1051 Heps = _mm_mul_ps(vfeps,H);
1052 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1053 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1054 velec = _mm_mul_ps(qq12,VV);
1055 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1056 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1058 /* Update potential sum for this i atom from the interaction with this j atom. */
1059 velec = _mm_andnot_ps(dummy_mask,velec);
1060 velecsum = _mm_add_ps(velecsum,velec);
1064 fscal = _mm_andnot_ps(dummy_mask,fscal);
1066 /* Calculate temporary vectorial force */
1067 tx = _mm_mul_ps(fscal,dx12);
1068 ty = _mm_mul_ps(fscal,dy12);
1069 tz = _mm_mul_ps(fscal,dz12);
1071 /* Update vectorial force */
1072 fix1 = _mm_add_ps(fix1,tx);
1073 fiy1 = _mm_add_ps(fiy1,ty);
1074 fiz1 = _mm_add_ps(fiz1,tz);
1076 fjx2 = _mm_add_ps(fjx2,tx);
1077 fjy2 = _mm_add_ps(fjy2,ty);
1078 fjz2 = _mm_add_ps(fjz2,tz);
1080 /**************************
1081 * CALCULATE INTERACTIONS *
1082 **************************/
1084 r20 = _mm_mul_ps(rsq20,rinv20);
1085 r20 = _mm_andnot_ps(dummy_mask,r20);
1087 /* Calculate table index by multiplying r with table scale and truncate to integer */
1088 rt = _mm_mul_ps(r20,vftabscale);
1089 vfitab = _mm_cvttps_epi32(rt);
1090 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1091 vfitab = _mm_slli_epi32(vfitab,2);
1093 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1094 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1095 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1096 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1097 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1098 _MM_TRANSPOSE4_PS(Y,F,G,H);
1099 Heps = _mm_mul_ps(vfeps,H);
1100 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1101 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1102 velec = _mm_mul_ps(qq20,VV);
1103 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1104 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1106 /* Update potential sum for this i atom from the interaction with this j atom. */
1107 velec = _mm_andnot_ps(dummy_mask,velec);
1108 velecsum = _mm_add_ps(velecsum,velec);
1112 fscal = _mm_andnot_ps(dummy_mask,fscal);
1114 /* Calculate temporary vectorial force */
1115 tx = _mm_mul_ps(fscal,dx20);
1116 ty = _mm_mul_ps(fscal,dy20);
1117 tz = _mm_mul_ps(fscal,dz20);
1119 /* Update vectorial force */
1120 fix2 = _mm_add_ps(fix2,tx);
1121 fiy2 = _mm_add_ps(fiy2,ty);
1122 fiz2 = _mm_add_ps(fiz2,tz);
1124 fjx0 = _mm_add_ps(fjx0,tx);
1125 fjy0 = _mm_add_ps(fjy0,ty);
1126 fjz0 = _mm_add_ps(fjz0,tz);
1128 /**************************
1129 * CALCULATE INTERACTIONS *
1130 **************************/
1132 r21 = _mm_mul_ps(rsq21,rinv21);
1133 r21 = _mm_andnot_ps(dummy_mask,r21);
1135 /* Calculate table index by multiplying r with table scale and truncate to integer */
1136 rt = _mm_mul_ps(r21,vftabscale);
1137 vfitab = _mm_cvttps_epi32(rt);
1138 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1139 vfitab = _mm_slli_epi32(vfitab,2);
1141 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1142 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1143 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1144 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1145 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1146 _MM_TRANSPOSE4_PS(Y,F,G,H);
1147 Heps = _mm_mul_ps(vfeps,H);
1148 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1149 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1150 velec = _mm_mul_ps(qq21,VV);
1151 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1152 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1154 /* Update potential sum for this i atom from the interaction with this j atom. */
1155 velec = _mm_andnot_ps(dummy_mask,velec);
1156 velecsum = _mm_add_ps(velecsum,velec);
1160 fscal = _mm_andnot_ps(dummy_mask,fscal);
1162 /* Calculate temporary vectorial force */
1163 tx = _mm_mul_ps(fscal,dx21);
1164 ty = _mm_mul_ps(fscal,dy21);
1165 tz = _mm_mul_ps(fscal,dz21);
1167 /* Update vectorial force */
1168 fix2 = _mm_add_ps(fix2,tx);
1169 fiy2 = _mm_add_ps(fiy2,ty);
1170 fiz2 = _mm_add_ps(fiz2,tz);
1172 fjx1 = _mm_add_ps(fjx1,tx);
1173 fjy1 = _mm_add_ps(fjy1,ty);
1174 fjz1 = _mm_add_ps(fjz1,tz);
1176 /**************************
1177 * CALCULATE INTERACTIONS *
1178 **************************/
1180 r22 = _mm_mul_ps(rsq22,rinv22);
1181 r22 = _mm_andnot_ps(dummy_mask,r22);
1183 /* Calculate table index by multiplying r with table scale and truncate to integer */
1184 rt = _mm_mul_ps(r22,vftabscale);
1185 vfitab = _mm_cvttps_epi32(rt);
1186 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1187 vfitab = _mm_slli_epi32(vfitab,2);
1189 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1190 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1191 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1192 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1193 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1194 _MM_TRANSPOSE4_PS(Y,F,G,H);
1195 Heps = _mm_mul_ps(vfeps,H);
1196 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1197 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1198 velec = _mm_mul_ps(qq22,VV);
1199 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1200 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1202 /* Update potential sum for this i atom from the interaction with this j atom. */
1203 velec = _mm_andnot_ps(dummy_mask,velec);
1204 velecsum = _mm_add_ps(velecsum,velec);
1208 fscal = _mm_andnot_ps(dummy_mask,fscal);
1210 /* Calculate temporary vectorial force */
1211 tx = _mm_mul_ps(fscal,dx22);
1212 ty = _mm_mul_ps(fscal,dy22);
1213 tz = _mm_mul_ps(fscal,dz22);
1215 /* Update vectorial force */
1216 fix2 = _mm_add_ps(fix2,tx);
1217 fiy2 = _mm_add_ps(fiy2,ty);
1218 fiz2 = _mm_add_ps(fiz2,tz);
1220 fjx2 = _mm_add_ps(fjx2,tx);
1221 fjy2 = _mm_add_ps(fjy2,ty);
1222 fjz2 = _mm_add_ps(fjz2,tz);
1224 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1225 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1226 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1227 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1229 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1230 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1232 /* Inner loop uses 409 flops */
1235 /* End of innermost loop */
1237 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1238 f+i_coord_offset,fshift+i_shift_offset);
1241 /* Update potential energies */
1242 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1243 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1245 /* Increment number of inner iterations */
1246 inneriter += j_index_end - j_index_start;
1248 /* Outer loop uses 20 flops */
1251 /* Increment number of outer iterations */
1254 /* Update outer/inner flops */
1256 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1259 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single
1260 * Electrostatics interaction: CubicSplineTable
1261 * VdW interaction: LennardJones
1262 * Geometry: Water3-Water3
1263 * Calculate force/pot: Force
1266 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse4_1_single
1267 (t_nblist * gmx_restrict nlist,
1268 rvec * gmx_restrict xx,
1269 rvec * gmx_restrict ff,
1270 t_forcerec * gmx_restrict fr,
1271 t_mdatoms * gmx_restrict mdatoms,
1272 nb_kernel_data_t * gmx_restrict kernel_data,
1273 t_nrnb * gmx_restrict nrnb)
1275 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1276 * just 0 for non-waters.
1277 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1278 * jnr indices corresponding to data put in the four positions in the SIMD register.
1280 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1281 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1282 int jnrA,jnrB,jnrC,jnrD;
1283 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1284 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1285 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1286 real rcutoff_scalar;
1287 real *shiftvec,*fshift,*x,*f;
1288 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1289 real scratch[4*DIM];
1290 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1292 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1294 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1296 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1297 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1298 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1299 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1300 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1301 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1302 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1303 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1304 __m128 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1305 __m128 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1306 __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1307 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1308 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1309 __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1310 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1311 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1312 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1315 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1318 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1319 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1321 __m128i ifour = _mm_set1_epi32(4);
1322 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1324 __m128 dummy_mask,cutoff_mask;
1325 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1326 __m128 one = _mm_set1_ps(1.0);
1327 __m128 two = _mm_set1_ps(2.0);
1333 jindex = nlist->jindex;
1335 shiftidx = nlist->shift;
1337 shiftvec = fr->shift_vec[0];
1338 fshift = fr->fshift[0];
1339 facel = _mm_set1_ps(fr->epsfac);
1340 charge = mdatoms->chargeA;
1341 nvdwtype = fr->ntype;
1342 vdwparam = fr->nbfp;
1343 vdwtype = mdatoms->typeA;
1345 vftab = kernel_data->table_elec->data;
1346 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1348 /* Setup water-specific parameters */
1349 inr = nlist->iinr[0];
1350 iq0 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1351 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1352 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1353 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1355 jq0 = _mm_set1_ps(charge[inr+0]);
1356 jq1 = _mm_set1_ps(charge[inr+1]);
1357 jq2 = _mm_set1_ps(charge[inr+2]);
1358 vdwjidx0A = 2*vdwtype[inr+0];
1359 qq00 = _mm_mul_ps(iq0,jq0);
1360 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1361 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1362 qq01 = _mm_mul_ps(iq0,jq1);
1363 qq02 = _mm_mul_ps(iq0,jq2);
1364 qq10 = _mm_mul_ps(iq1,jq0);
1365 qq11 = _mm_mul_ps(iq1,jq1);
1366 qq12 = _mm_mul_ps(iq1,jq2);
1367 qq20 = _mm_mul_ps(iq2,jq0);
1368 qq21 = _mm_mul_ps(iq2,jq1);
1369 qq22 = _mm_mul_ps(iq2,jq2);
1371 /* Avoid stupid compiler warnings */
1372 jnrA = jnrB = jnrC = jnrD = 0;
1373 j_coord_offsetA = 0;
1374 j_coord_offsetB = 0;
1375 j_coord_offsetC = 0;
1376 j_coord_offsetD = 0;
1381 for(iidx=0;iidx<4*DIM;iidx++)
1383 scratch[iidx] = 0.0;
1386 /* Start outer loop over neighborlists */
1387 for(iidx=0; iidx<nri; iidx++)
1389 /* Load shift vector for this list */
1390 i_shift_offset = DIM*shiftidx[iidx];
1392 /* Load limits for loop over neighbors */
1393 j_index_start = jindex[iidx];
1394 j_index_end = jindex[iidx+1];
1396 /* Get outer coordinate index */
1398 i_coord_offset = DIM*inr;
1400 /* Load i particle coords and add shift vector */
1401 gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1402 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1404 fix0 = _mm_setzero_ps();
1405 fiy0 = _mm_setzero_ps();
1406 fiz0 = _mm_setzero_ps();
1407 fix1 = _mm_setzero_ps();
1408 fiy1 = _mm_setzero_ps();
1409 fiz1 = _mm_setzero_ps();
1410 fix2 = _mm_setzero_ps();
1411 fiy2 = _mm_setzero_ps();
1412 fiz2 = _mm_setzero_ps();
1414 /* Start inner kernel loop */
1415 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1418 /* Get j neighbor index, and coordinate index */
1420 jnrB = jjnr[jidx+1];
1421 jnrC = jjnr[jidx+2];
1422 jnrD = jjnr[jidx+3];
1423 j_coord_offsetA = DIM*jnrA;
1424 j_coord_offsetB = DIM*jnrB;
1425 j_coord_offsetC = DIM*jnrC;
1426 j_coord_offsetD = DIM*jnrD;
1428 /* load j atom coordinates */
1429 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1430 x+j_coord_offsetC,x+j_coord_offsetD,
1431 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1433 /* Calculate displacement vector */
1434 dx00 = _mm_sub_ps(ix0,jx0);
1435 dy00 = _mm_sub_ps(iy0,jy0);
1436 dz00 = _mm_sub_ps(iz0,jz0);
1437 dx01 = _mm_sub_ps(ix0,jx1);
1438 dy01 = _mm_sub_ps(iy0,jy1);
1439 dz01 = _mm_sub_ps(iz0,jz1);
1440 dx02 = _mm_sub_ps(ix0,jx2);
1441 dy02 = _mm_sub_ps(iy0,jy2);
1442 dz02 = _mm_sub_ps(iz0,jz2);
1443 dx10 = _mm_sub_ps(ix1,jx0);
1444 dy10 = _mm_sub_ps(iy1,jy0);
1445 dz10 = _mm_sub_ps(iz1,jz0);
1446 dx11 = _mm_sub_ps(ix1,jx1);
1447 dy11 = _mm_sub_ps(iy1,jy1);
1448 dz11 = _mm_sub_ps(iz1,jz1);
1449 dx12 = _mm_sub_ps(ix1,jx2);
1450 dy12 = _mm_sub_ps(iy1,jy2);
1451 dz12 = _mm_sub_ps(iz1,jz2);
1452 dx20 = _mm_sub_ps(ix2,jx0);
1453 dy20 = _mm_sub_ps(iy2,jy0);
1454 dz20 = _mm_sub_ps(iz2,jz0);
1455 dx21 = _mm_sub_ps(ix2,jx1);
1456 dy21 = _mm_sub_ps(iy2,jy1);
1457 dz21 = _mm_sub_ps(iz2,jz1);
1458 dx22 = _mm_sub_ps(ix2,jx2);
1459 dy22 = _mm_sub_ps(iy2,jy2);
1460 dz22 = _mm_sub_ps(iz2,jz2);
1462 /* Calculate squared distance and things based on it */
1463 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1464 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1465 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1466 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1467 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1468 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1469 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1470 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1471 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1473 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1474 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1475 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1476 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1477 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1478 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1479 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1480 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1481 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1483 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1485 fjx0 = _mm_setzero_ps();
1486 fjy0 = _mm_setzero_ps();
1487 fjz0 = _mm_setzero_ps();
1488 fjx1 = _mm_setzero_ps();
1489 fjy1 = _mm_setzero_ps();
1490 fjz1 = _mm_setzero_ps();
1491 fjx2 = _mm_setzero_ps();
1492 fjy2 = _mm_setzero_ps();
1493 fjz2 = _mm_setzero_ps();
1495 /**************************
1496 * CALCULATE INTERACTIONS *
1497 **************************/
1499 r00 = _mm_mul_ps(rsq00,rinv00);
1501 /* Calculate table index by multiplying r with table scale and truncate to integer */
1502 rt = _mm_mul_ps(r00,vftabscale);
1503 vfitab = _mm_cvttps_epi32(rt);
1504 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1505 vfitab = _mm_slli_epi32(vfitab,2);
1507 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1508 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1509 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1510 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1511 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1512 _MM_TRANSPOSE4_PS(Y,F,G,H);
1513 Heps = _mm_mul_ps(vfeps,H);
1514 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1515 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1516 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1518 /* LENNARD-JONES DISPERSION/REPULSION */
1520 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1521 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1523 fscal = _mm_add_ps(felec,fvdw);
1525 /* Calculate temporary vectorial force */
1526 tx = _mm_mul_ps(fscal,dx00);
1527 ty = _mm_mul_ps(fscal,dy00);
1528 tz = _mm_mul_ps(fscal,dz00);
1530 /* Update vectorial force */
1531 fix0 = _mm_add_ps(fix0,tx);
1532 fiy0 = _mm_add_ps(fiy0,ty);
1533 fiz0 = _mm_add_ps(fiz0,tz);
1535 fjx0 = _mm_add_ps(fjx0,tx);
1536 fjy0 = _mm_add_ps(fjy0,ty);
1537 fjz0 = _mm_add_ps(fjz0,tz);
1539 /**************************
1540 * CALCULATE INTERACTIONS *
1541 **************************/
1543 r01 = _mm_mul_ps(rsq01,rinv01);
1545 /* Calculate table index by multiplying r with table scale and truncate to integer */
1546 rt = _mm_mul_ps(r01,vftabscale);
1547 vfitab = _mm_cvttps_epi32(rt);
1548 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1549 vfitab = _mm_slli_epi32(vfitab,2);
1551 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1552 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1553 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1554 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1555 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1556 _MM_TRANSPOSE4_PS(Y,F,G,H);
1557 Heps = _mm_mul_ps(vfeps,H);
1558 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1559 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1560 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1564 /* Calculate temporary vectorial force */
1565 tx = _mm_mul_ps(fscal,dx01);
1566 ty = _mm_mul_ps(fscal,dy01);
1567 tz = _mm_mul_ps(fscal,dz01);
1569 /* Update vectorial force */
1570 fix0 = _mm_add_ps(fix0,tx);
1571 fiy0 = _mm_add_ps(fiy0,ty);
1572 fiz0 = _mm_add_ps(fiz0,tz);
1574 fjx1 = _mm_add_ps(fjx1,tx);
1575 fjy1 = _mm_add_ps(fjy1,ty);
1576 fjz1 = _mm_add_ps(fjz1,tz);
1578 /**************************
1579 * CALCULATE INTERACTIONS *
1580 **************************/
1582 r02 = _mm_mul_ps(rsq02,rinv02);
1584 /* Calculate table index by multiplying r with table scale and truncate to integer */
1585 rt = _mm_mul_ps(r02,vftabscale);
1586 vfitab = _mm_cvttps_epi32(rt);
1587 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1588 vfitab = _mm_slli_epi32(vfitab,2);
1590 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1591 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1592 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1593 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1594 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1595 _MM_TRANSPOSE4_PS(Y,F,G,H);
1596 Heps = _mm_mul_ps(vfeps,H);
1597 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1598 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1599 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1603 /* Calculate temporary vectorial force */
1604 tx = _mm_mul_ps(fscal,dx02);
1605 ty = _mm_mul_ps(fscal,dy02);
1606 tz = _mm_mul_ps(fscal,dz02);
1608 /* Update vectorial force */
1609 fix0 = _mm_add_ps(fix0,tx);
1610 fiy0 = _mm_add_ps(fiy0,ty);
1611 fiz0 = _mm_add_ps(fiz0,tz);
1613 fjx2 = _mm_add_ps(fjx2,tx);
1614 fjy2 = _mm_add_ps(fjy2,ty);
1615 fjz2 = _mm_add_ps(fjz2,tz);
1617 /**************************
1618 * CALCULATE INTERACTIONS *
1619 **************************/
1621 r10 = _mm_mul_ps(rsq10,rinv10);
1623 /* Calculate table index by multiplying r with table scale and truncate to integer */
1624 rt = _mm_mul_ps(r10,vftabscale);
1625 vfitab = _mm_cvttps_epi32(rt);
1626 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1627 vfitab = _mm_slli_epi32(vfitab,2);
1629 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1630 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1631 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1632 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1633 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1634 _MM_TRANSPOSE4_PS(Y,F,G,H);
1635 Heps = _mm_mul_ps(vfeps,H);
1636 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1637 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1638 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1642 /* Calculate temporary vectorial force */
1643 tx = _mm_mul_ps(fscal,dx10);
1644 ty = _mm_mul_ps(fscal,dy10);
1645 tz = _mm_mul_ps(fscal,dz10);
1647 /* Update vectorial force */
1648 fix1 = _mm_add_ps(fix1,tx);
1649 fiy1 = _mm_add_ps(fiy1,ty);
1650 fiz1 = _mm_add_ps(fiz1,tz);
1652 fjx0 = _mm_add_ps(fjx0,tx);
1653 fjy0 = _mm_add_ps(fjy0,ty);
1654 fjz0 = _mm_add_ps(fjz0,tz);
1656 /**************************
1657 * CALCULATE INTERACTIONS *
1658 **************************/
1660 r11 = _mm_mul_ps(rsq11,rinv11);
1662 /* Calculate table index by multiplying r with table scale and truncate to integer */
1663 rt = _mm_mul_ps(r11,vftabscale);
1664 vfitab = _mm_cvttps_epi32(rt);
1665 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1666 vfitab = _mm_slli_epi32(vfitab,2);
1668 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1669 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1670 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1671 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1672 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1673 _MM_TRANSPOSE4_PS(Y,F,G,H);
1674 Heps = _mm_mul_ps(vfeps,H);
1675 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1676 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1677 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1681 /* Calculate temporary vectorial force */
1682 tx = _mm_mul_ps(fscal,dx11);
1683 ty = _mm_mul_ps(fscal,dy11);
1684 tz = _mm_mul_ps(fscal,dz11);
1686 /* Update vectorial force */
1687 fix1 = _mm_add_ps(fix1,tx);
1688 fiy1 = _mm_add_ps(fiy1,ty);
1689 fiz1 = _mm_add_ps(fiz1,tz);
1691 fjx1 = _mm_add_ps(fjx1,tx);
1692 fjy1 = _mm_add_ps(fjy1,ty);
1693 fjz1 = _mm_add_ps(fjz1,tz);
1695 /**************************
1696 * CALCULATE INTERACTIONS *
1697 **************************/
1699 r12 = _mm_mul_ps(rsq12,rinv12);
1701 /* Calculate table index by multiplying r with table scale and truncate to integer */
1702 rt = _mm_mul_ps(r12,vftabscale);
1703 vfitab = _mm_cvttps_epi32(rt);
1704 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1705 vfitab = _mm_slli_epi32(vfitab,2);
1707 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1708 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1709 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1710 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1711 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1712 _MM_TRANSPOSE4_PS(Y,F,G,H);
1713 Heps = _mm_mul_ps(vfeps,H);
1714 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1715 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1716 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1720 /* Calculate temporary vectorial force */
1721 tx = _mm_mul_ps(fscal,dx12);
1722 ty = _mm_mul_ps(fscal,dy12);
1723 tz = _mm_mul_ps(fscal,dz12);
1725 /* Update vectorial force */
1726 fix1 = _mm_add_ps(fix1,tx);
1727 fiy1 = _mm_add_ps(fiy1,ty);
1728 fiz1 = _mm_add_ps(fiz1,tz);
1730 fjx2 = _mm_add_ps(fjx2,tx);
1731 fjy2 = _mm_add_ps(fjy2,ty);
1732 fjz2 = _mm_add_ps(fjz2,tz);
1734 /**************************
1735 * CALCULATE INTERACTIONS *
1736 **************************/
1738 r20 = _mm_mul_ps(rsq20,rinv20);
1740 /* Calculate table index by multiplying r with table scale and truncate to integer */
1741 rt = _mm_mul_ps(r20,vftabscale);
1742 vfitab = _mm_cvttps_epi32(rt);
1743 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1744 vfitab = _mm_slli_epi32(vfitab,2);
1746 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1747 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1748 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1749 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1750 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1751 _MM_TRANSPOSE4_PS(Y,F,G,H);
1752 Heps = _mm_mul_ps(vfeps,H);
1753 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1754 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1755 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1759 /* Calculate temporary vectorial force */
1760 tx = _mm_mul_ps(fscal,dx20);
1761 ty = _mm_mul_ps(fscal,dy20);
1762 tz = _mm_mul_ps(fscal,dz20);
1764 /* Update vectorial force */
1765 fix2 = _mm_add_ps(fix2,tx);
1766 fiy2 = _mm_add_ps(fiy2,ty);
1767 fiz2 = _mm_add_ps(fiz2,tz);
1769 fjx0 = _mm_add_ps(fjx0,tx);
1770 fjy0 = _mm_add_ps(fjy0,ty);
1771 fjz0 = _mm_add_ps(fjz0,tz);
1773 /**************************
1774 * CALCULATE INTERACTIONS *
1775 **************************/
1777 r21 = _mm_mul_ps(rsq21,rinv21);
1779 /* Calculate table index by multiplying r with table scale and truncate to integer */
1780 rt = _mm_mul_ps(r21,vftabscale);
1781 vfitab = _mm_cvttps_epi32(rt);
1782 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1783 vfitab = _mm_slli_epi32(vfitab,2);
1785 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1786 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1787 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1788 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1789 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1790 _MM_TRANSPOSE4_PS(Y,F,G,H);
1791 Heps = _mm_mul_ps(vfeps,H);
1792 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1793 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1794 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1798 /* Calculate temporary vectorial force */
1799 tx = _mm_mul_ps(fscal,dx21);
1800 ty = _mm_mul_ps(fscal,dy21);
1801 tz = _mm_mul_ps(fscal,dz21);
1803 /* Update vectorial force */
1804 fix2 = _mm_add_ps(fix2,tx);
1805 fiy2 = _mm_add_ps(fiy2,ty);
1806 fiz2 = _mm_add_ps(fiz2,tz);
1808 fjx1 = _mm_add_ps(fjx1,tx);
1809 fjy1 = _mm_add_ps(fjy1,ty);
1810 fjz1 = _mm_add_ps(fjz1,tz);
1812 /**************************
1813 * CALCULATE INTERACTIONS *
1814 **************************/
1816 r22 = _mm_mul_ps(rsq22,rinv22);
1818 /* Calculate table index by multiplying r with table scale and truncate to integer */
1819 rt = _mm_mul_ps(r22,vftabscale);
1820 vfitab = _mm_cvttps_epi32(rt);
1821 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1822 vfitab = _mm_slli_epi32(vfitab,2);
1824 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1825 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1826 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1827 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1828 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1829 _MM_TRANSPOSE4_PS(Y,F,G,H);
1830 Heps = _mm_mul_ps(vfeps,H);
1831 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1832 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1833 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1837 /* Calculate temporary vectorial force */
1838 tx = _mm_mul_ps(fscal,dx22);
1839 ty = _mm_mul_ps(fscal,dy22);
1840 tz = _mm_mul_ps(fscal,dz22);
1842 /* Update vectorial force */
1843 fix2 = _mm_add_ps(fix2,tx);
1844 fiy2 = _mm_add_ps(fiy2,ty);
1845 fiz2 = _mm_add_ps(fiz2,tz);
1847 fjx2 = _mm_add_ps(fjx2,tx);
1848 fjy2 = _mm_add_ps(fjy2,ty);
1849 fjz2 = _mm_add_ps(fjz2,tz);
1851 fjptrA = f+j_coord_offsetA;
1852 fjptrB = f+j_coord_offsetB;
1853 fjptrC = f+j_coord_offsetC;
1854 fjptrD = f+j_coord_offsetD;
1856 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1857 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1859 /* Inner loop uses 359 flops */
1862 if(jidx<j_index_end)
1865 /* Get j neighbor index, and coordinate index */
1866 jnrlistA = jjnr[jidx];
1867 jnrlistB = jjnr[jidx+1];
1868 jnrlistC = jjnr[jidx+2];
1869 jnrlistD = jjnr[jidx+3];
1870 /* Sign of each element will be negative for non-real atoms.
1871 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1872 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1874 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1875 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1876 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1877 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1878 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1879 j_coord_offsetA = DIM*jnrA;
1880 j_coord_offsetB = DIM*jnrB;
1881 j_coord_offsetC = DIM*jnrC;
1882 j_coord_offsetD = DIM*jnrD;
1884 /* load j atom coordinates */
1885 gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1886 x+j_coord_offsetC,x+j_coord_offsetD,
1887 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1889 /* Calculate displacement vector */
1890 dx00 = _mm_sub_ps(ix0,jx0);
1891 dy00 = _mm_sub_ps(iy0,jy0);
1892 dz00 = _mm_sub_ps(iz0,jz0);
1893 dx01 = _mm_sub_ps(ix0,jx1);
1894 dy01 = _mm_sub_ps(iy0,jy1);
1895 dz01 = _mm_sub_ps(iz0,jz1);
1896 dx02 = _mm_sub_ps(ix0,jx2);
1897 dy02 = _mm_sub_ps(iy0,jy2);
1898 dz02 = _mm_sub_ps(iz0,jz2);
1899 dx10 = _mm_sub_ps(ix1,jx0);
1900 dy10 = _mm_sub_ps(iy1,jy0);
1901 dz10 = _mm_sub_ps(iz1,jz0);
1902 dx11 = _mm_sub_ps(ix1,jx1);
1903 dy11 = _mm_sub_ps(iy1,jy1);
1904 dz11 = _mm_sub_ps(iz1,jz1);
1905 dx12 = _mm_sub_ps(ix1,jx2);
1906 dy12 = _mm_sub_ps(iy1,jy2);
1907 dz12 = _mm_sub_ps(iz1,jz2);
1908 dx20 = _mm_sub_ps(ix2,jx0);
1909 dy20 = _mm_sub_ps(iy2,jy0);
1910 dz20 = _mm_sub_ps(iz2,jz0);
1911 dx21 = _mm_sub_ps(ix2,jx1);
1912 dy21 = _mm_sub_ps(iy2,jy1);
1913 dz21 = _mm_sub_ps(iz2,jz1);
1914 dx22 = _mm_sub_ps(ix2,jx2);
1915 dy22 = _mm_sub_ps(iy2,jy2);
1916 dz22 = _mm_sub_ps(iz2,jz2);
1918 /* Calculate squared distance and things based on it */
1919 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1920 rsq01 = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1921 rsq02 = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1922 rsq10 = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1923 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1924 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1925 rsq20 = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1926 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1927 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1929 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1930 rinv01 = gmx_mm_invsqrt_ps(rsq01);
1931 rinv02 = gmx_mm_invsqrt_ps(rsq02);
1932 rinv10 = gmx_mm_invsqrt_ps(rsq10);
1933 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1934 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1935 rinv20 = gmx_mm_invsqrt_ps(rsq20);
1936 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1937 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1939 rinvsq00 = _mm_mul_ps(rinv00,rinv00);
1941 fjx0 = _mm_setzero_ps();
1942 fjy0 = _mm_setzero_ps();
1943 fjz0 = _mm_setzero_ps();
1944 fjx1 = _mm_setzero_ps();
1945 fjy1 = _mm_setzero_ps();
1946 fjz1 = _mm_setzero_ps();
1947 fjx2 = _mm_setzero_ps();
1948 fjy2 = _mm_setzero_ps();
1949 fjz2 = _mm_setzero_ps();
1951 /**************************
1952 * CALCULATE INTERACTIONS *
1953 **************************/
1955 r00 = _mm_mul_ps(rsq00,rinv00);
1956 r00 = _mm_andnot_ps(dummy_mask,r00);
1958 /* Calculate table index by multiplying r with table scale and truncate to integer */
1959 rt = _mm_mul_ps(r00,vftabscale);
1960 vfitab = _mm_cvttps_epi32(rt);
1961 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1962 vfitab = _mm_slli_epi32(vfitab,2);
1964 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1965 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1966 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1967 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1968 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1969 _MM_TRANSPOSE4_PS(Y,F,G,H);
1970 Heps = _mm_mul_ps(vfeps,H);
1971 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1972 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1973 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1975 /* LENNARD-JONES DISPERSION/REPULSION */
1977 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1978 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1980 fscal = _mm_add_ps(felec,fvdw);
1982 fscal = _mm_andnot_ps(dummy_mask,fscal);
1984 /* Calculate temporary vectorial force */
1985 tx = _mm_mul_ps(fscal,dx00);
1986 ty = _mm_mul_ps(fscal,dy00);
1987 tz = _mm_mul_ps(fscal,dz00);
1989 /* Update vectorial force */
1990 fix0 = _mm_add_ps(fix0,tx);
1991 fiy0 = _mm_add_ps(fiy0,ty);
1992 fiz0 = _mm_add_ps(fiz0,tz);
1994 fjx0 = _mm_add_ps(fjx0,tx);
1995 fjy0 = _mm_add_ps(fjy0,ty);
1996 fjz0 = _mm_add_ps(fjz0,tz);
1998 /**************************
1999 * CALCULATE INTERACTIONS *
2000 **************************/
2002 r01 = _mm_mul_ps(rsq01,rinv01);
2003 r01 = _mm_andnot_ps(dummy_mask,r01);
2005 /* Calculate table index by multiplying r with table scale and truncate to integer */
2006 rt = _mm_mul_ps(r01,vftabscale);
2007 vfitab = _mm_cvttps_epi32(rt);
2008 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2009 vfitab = _mm_slli_epi32(vfitab,2);
2011 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2012 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2013 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2014 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2015 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2016 _MM_TRANSPOSE4_PS(Y,F,G,H);
2017 Heps = _mm_mul_ps(vfeps,H);
2018 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2019 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2020 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2024 fscal = _mm_andnot_ps(dummy_mask,fscal);
2026 /* Calculate temporary vectorial force */
2027 tx = _mm_mul_ps(fscal,dx01);
2028 ty = _mm_mul_ps(fscal,dy01);
2029 tz = _mm_mul_ps(fscal,dz01);
2031 /* Update vectorial force */
2032 fix0 = _mm_add_ps(fix0,tx);
2033 fiy0 = _mm_add_ps(fiy0,ty);
2034 fiz0 = _mm_add_ps(fiz0,tz);
2036 fjx1 = _mm_add_ps(fjx1,tx);
2037 fjy1 = _mm_add_ps(fjy1,ty);
2038 fjz1 = _mm_add_ps(fjz1,tz);
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 r02 = _mm_mul_ps(rsq02,rinv02);
2045 r02 = _mm_andnot_ps(dummy_mask,r02);
2047 /* Calculate table index by multiplying r with table scale and truncate to integer */
2048 rt = _mm_mul_ps(r02,vftabscale);
2049 vfitab = _mm_cvttps_epi32(rt);
2050 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2051 vfitab = _mm_slli_epi32(vfitab,2);
2053 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2054 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2055 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2056 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2057 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2058 _MM_TRANSPOSE4_PS(Y,F,G,H);
2059 Heps = _mm_mul_ps(vfeps,H);
2060 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2061 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2062 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2066 fscal = _mm_andnot_ps(dummy_mask,fscal);
2068 /* Calculate temporary vectorial force */
2069 tx = _mm_mul_ps(fscal,dx02);
2070 ty = _mm_mul_ps(fscal,dy02);
2071 tz = _mm_mul_ps(fscal,dz02);
2073 /* Update vectorial force */
2074 fix0 = _mm_add_ps(fix0,tx);
2075 fiy0 = _mm_add_ps(fiy0,ty);
2076 fiz0 = _mm_add_ps(fiz0,tz);
2078 fjx2 = _mm_add_ps(fjx2,tx);
2079 fjy2 = _mm_add_ps(fjy2,ty);
2080 fjz2 = _mm_add_ps(fjz2,tz);
2082 /**************************
2083 * CALCULATE INTERACTIONS *
2084 **************************/
2086 r10 = _mm_mul_ps(rsq10,rinv10);
2087 r10 = _mm_andnot_ps(dummy_mask,r10);
2089 /* Calculate table index by multiplying r with table scale and truncate to integer */
2090 rt = _mm_mul_ps(r10,vftabscale);
2091 vfitab = _mm_cvttps_epi32(rt);
2092 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2093 vfitab = _mm_slli_epi32(vfitab,2);
2095 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2096 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2097 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2098 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2099 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2100 _MM_TRANSPOSE4_PS(Y,F,G,H);
2101 Heps = _mm_mul_ps(vfeps,H);
2102 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2103 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2104 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2108 fscal = _mm_andnot_ps(dummy_mask,fscal);
2110 /* Calculate temporary vectorial force */
2111 tx = _mm_mul_ps(fscal,dx10);
2112 ty = _mm_mul_ps(fscal,dy10);
2113 tz = _mm_mul_ps(fscal,dz10);
2115 /* Update vectorial force */
2116 fix1 = _mm_add_ps(fix1,tx);
2117 fiy1 = _mm_add_ps(fiy1,ty);
2118 fiz1 = _mm_add_ps(fiz1,tz);
2120 fjx0 = _mm_add_ps(fjx0,tx);
2121 fjy0 = _mm_add_ps(fjy0,ty);
2122 fjz0 = _mm_add_ps(fjz0,tz);
2124 /**************************
2125 * CALCULATE INTERACTIONS *
2126 **************************/
2128 r11 = _mm_mul_ps(rsq11,rinv11);
2129 r11 = _mm_andnot_ps(dummy_mask,r11);
2131 /* Calculate table index by multiplying r with table scale and truncate to integer */
2132 rt = _mm_mul_ps(r11,vftabscale);
2133 vfitab = _mm_cvttps_epi32(rt);
2134 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2135 vfitab = _mm_slli_epi32(vfitab,2);
2137 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2138 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2139 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2140 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2141 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2142 _MM_TRANSPOSE4_PS(Y,F,G,H);
2143 Heps = _mm_mul_ps(vfeps,H);
2144 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2145 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2146 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2150 fscal = _mm_andnot_ps(dummy_mask,fscal);
2152 /* Calculate temporary vectorial force */
2153 tx = _mm_mul_ps(fscal,dx11);
2154 ty = _mm_mul_ps(fscal,dy11);
2155 tz = _mm_mul_ps(fscal,dz11);
2157 /* Update vectorial force */
2158 fix1 = _mm_add_ps(fix1,tx);
2159 fiy1 = _mm_add_ps(fiy1,ty);
2160 fiz1 = _mm_add_ps(fiz1,tz);
2162 fjx1 = _mm_add_ps(fjx1,tx);
2163 fjy1 = _mm_add_ps(fjy1,ty);
2164 fjz1 = _mm_add_ps(fjz1,tz);
2166 /**************************
2167 * CALCULATE INTERACTIONS *
2168 **************************/
2170 r12 = _mm_mul_ps(rsq12,rinv12);
2171 r12 = _mm_andnot_ps(dummy_mask,r12);
2173 /* Calculate table index by multiplying r with table scale and truncate to integer */
2174 rt = _mm_mul_ps(r12,vftabscale);
2175 vfitab = _mm_cvttps_epi32(rt);
2176 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2177 vfitab = _mm_slli_epi32(vfitab,2);
2179 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2180 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2181 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2182 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2183 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2184 _MM_TRANSPOSE4_PS(Y,F,G,H);
2185 Heps = _mm_mul_ps(vfeps,H);
2186 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2187 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2188 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2192 fscal = _mm_andnot_ps(dummy_mask,fscal);
2194 /* Calculate temporary vectorial force */
2195 tx = _mm_mul_ps(fscal,dx12);
2196 ty = _mm_mul_ps(fscal,dy12);
2197 tz = _mm_mul_ps(fscal,dz12);
2199 /* Update vectorial force */
2200 fix1 = _mm_add_ps(fix1,tx);
2201 fiy1 = _mm_add_ps(fiy1,ty);
2202 fiz1 = _mm_add_ps(fiz1,tz);
2204 fjx2 = _mm_add_ps(fjx2,tx);
2205 fjy2 = _mm_add_ps(fjy2,ty);
2206 fjz2 = _mm_add_ps(fjz2,tz);
2208 /**************************
2209 * CALCULATE INTERACTIONS *
2210 **************************/
2212 r20 = _mm_mul_ps(rsq20,rinv20);
2213 r20 = _mm_andnot_ps(dummy_mask,r20);
2215 /* Calculate table index by multiplying r with table scale and truncate to integer */
2216 rt = _mm_mul_ps(r20,vftabscale);
2217 vfitab = _mm_cvttps_epi32(rt);
2218 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2219 vfitab = _mm_slli_epi32(vfitab,2);
2221 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2222 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2223 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2224 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2225 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2226 _MM_TRANSPOSE4_PS(Y,F,G,H);
2227 Heps = _mm_mul_ps(vfeps,H);
2228 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2229 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2230 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2234 fscal = _mm_andnot_ps(dummy_mask,fscal);
2236 /* Calculate temporary vectorial force */
2237 tx = _mm_mul_ps(fscal,dx20);
2238 ty = _mm_mul_ps(fscal,dy20);
2239 tz = _mm_mul_ps(fscal,dz20);
2241 /* Update vectorial force */
2242 fix2 = _mm_add_ps(fix2,tx);
2243 fiy2 = _mm_add_ps(fiy2,ty);
2244 fiz2 = _mm_add_ps(fiz2,tz);
2246 fjx0 = _mm_add_ps(fjx0,tx);
2247 fjy0 = _mm_add_ps(fjy0,ty);
2248 fjz0 = _mm_add_ps(fjz0,tz);
2250 /**************************
2251 * CALCULATE INTERACTIONS *
2252 **************************/
2254 r21 = _mm_mul_ps(rsq21,rinv21);
2255 r21 = _mm_andnot_ps(dummy_mask,r21);
2257 /* Calculate table index by multiplying r with table scale and truncate to integer */
2258 rt = _mm_mul_ps(r21,vftabscale);
2259 vfitab = _mm_cvttps_epi32(rt);
2260 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2261 vfitab = _mm_slli_epi32(vfitab,2);
2263 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2264 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2265 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2266 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2267 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2268 _MM_TRANSPOSE4_PS(Y,F,G,H);
2269 Heps = _mm_mul_ps(vfeps,H);
2270 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2271 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2272 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2276 fscal = _mm_andnot_ps(dummy_mask,fscal);
2278 /* Calculate temporary vectorial force */
2279 tx = _mm_mul_ps(fscal,dx21);
2280 ty = _mm_mul_ps(fscal,dy21);
2281 tz = _mm_mul_ps(fscal,dz21);
2283 /* Update vectorial force */
2284 fix2 = _mm_add_ps(fix2,tx);
2285 fiy2 = _mm_add_ps(fiy2,ty);
2286 fiz2 = _mm_add_ps(fiz2,tz);
2288 fjx1 = _mm_add_ps(fjx1,tx);
2289 fjy1 = _mm_add_ps(fjy1,ty);
2290 fjz1 = _mm_add_ps(fjz1,tz);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 r22 = _mm_mul_ps(rsq22,rinv22);
2297 r22 = _mm_andnot_ps(dummy_mask,r22);
2299 /* Calculate table index by multiplying r with table scale and truncate to integer */
2300 rt = _mm_mul_ps(r22,vftabscale);
2301 vfitab = _mm_cvttps_epi32(rt);
2302 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2303 vfitab = _mm_slli_epi32(vfitab,2);
2305 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2306 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2307 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2308 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2309 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2310 _MM_TRANSPOSE4_PS(Y,F,G,H);
2311 Heps = _mm_mul_ps(vfeps,H);
2312 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2313 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2314 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2318 fscal = _mm_andnot_ps(dummy_mask,fscal);
2320 /* Calculate temporary vectorial force */
2321 tx = _mm_mul_ps(fscal,dx22);
2322 ty = _mm_mul_ps(fscal,dy22);
2323 tz = _mm_mul_ps(fscal,dz22);
2325 /* Update vectorial force */
2326 fix2 = _mm_add_ps(fix2,tx);
2327 fiy2 = _mm_add_ps(fiy2,ty);
2328 fiz2 = _mm_add_ps(fiz2,tz);
2330 fjx2 = _mm_add_ps(fjx2,tx);
2331 fjy2 = _mm_add_ps(fjy2,ty);
2332 fjz2 = _mm_add_ps(fjz2,tz);
2334 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2335 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2336 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2337 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2339 gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2340 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2342 /* Inner loop uses 368 flops */
2345 /* End of innermost loop */
2347 gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2348 f+i_coord_offset,fshift+i_shift_offset);
2350 /* Increment number of inner iterations */
2351 inneriter += j_index_end - j_index_start;
2353 /* Outer loop uses 18 flops */
2356 /* Increment number of outer iterations */
2359 /* Update outer/inner flops */
2361 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);