2 * Note: this file was generated by the Gromacs sse4_1_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any later version.
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse4_1_single.h"
34 #include "kernelutil_x86_sse4_1_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_sse4_1_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
65 real *shiftvec,*fshift,*x,*f;
66 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
68 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
98 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
102 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
104 __m128i ifour = _mm_set1_epi32(4);
105 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
107 __m128 dummy_mask,cutoff_mask;
108 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109 __m128 one = _mm_set1_ps(1.0);
110 __m128 two = _mm_set1_ps(2.0);
116 jindex = nlist->jindex;
118 shiftidx = nlist->shift;
120 shiftvec = fr->shift_vec[0];
121 fshift = fr->fshift[0];
122 facel = _mm_set1_ps(fr->epsfac);
123 charge = mdatoms->chargeA;
124 nvdwtype = fr->ntype;
126 vdwtype = mdatoms->typeA;
128 vftab = kernel_data->table_elec->data;
129 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
131 /* Setup water-specific parameters */
132 inr = nlist->iinr[0];
133 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
136 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
138 jq1 = _mm_set1_ps(charge[inr+1]);
139 jq2 = _mm_set1_ps(charge[inr+2]);
140 jq3 = _mm_set1_ps(charge[inr+3]);
141 vdwjidx0A = 2*vdwtype[inr+0];
142 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
143 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
144 qq11 = _mm_mul_ps(iq1,jq1);
145 qq12 = _mm_mul_ps(iq1,jq2);
146 qq13 = _mm_mul_ps(iq1,jq3);
147 qq21 = _mm_mul_ps(iq2,jq1);
148 qq22 = _mm_mul_ps(iq2,jq2);
149 qq23 = _mm_mul_ps(iq2,jq3);
150 qq31 = _mm_mul_ps(iq3,jq1);
151 qq32 = _mm_mul_ps(iq3,jq2);
152 qq33 = _mm_mul_ps(iq3,jq3);
154 /* Avoid stupid compiler warnings */
155 jnrA = jnrB = jnrC = jnrD = 0;
164 for(iidx=0;iidx<4*DIM;iidx++)
169 /* Start outer loop over neighborlists */
170 for(iidx=0; iidx<nri; iidx++)
172 /* Load shift vector for this list */
173 i_shift_offset = DIM*shiftidx[iidx];
175 /* Load limits for loop over neighbors */
176 j_index_start = jindex[iidx];
177 j_index_end = jindex[iidx+1];
179 /* Get outer coordinate index */
181 i_coord_offset = DIM*inr;
183 /* Load i particle coords and add shift vector */
184 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
185 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
187 fix0 = _mm_setzero_ps();
188 fiy0 = _mm_setzero_ps();
189 fiz0 = _mm_setzero_ps();
190 fix1 = _mm_setzero_ps();
191 fiy1 = _mm_setzero_ps();
192 fiz1 = _mm_setzero_ps();
193 fix2 = _mm_setzero_ps();
194 fiy2 = _mm_setzero_ps();
195 fiz2 = _mm_setzero_ps();
196 fix3 = _mm_setzero_ps();
197 fiy3 = _mm_setzero_ps();
198 fiz3 = _mm_setzero_ps();
200 /* Reset potential sums */
201 velecsum = _mm_setzero_ps();
202 vvdwsum = _mm_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
208 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA = DIM*jnrA;
214 j_coord_offsetB = DIM*jnrB;
215 j_coord_offsetC = DIM*jnrC;
216 j_coord_offsetD = DIM*jnrD;
218 /* load j atom coordinates */
219 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
220 x+j_coord_offsetC,x+j_coord_offsetD,
221 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
222 &jy2,&jz2,&jx3,&jy3,&jz3);
224 /* Calculate displacement vector */
225 dx00 = _mm_sub_ps(ix0,jx0);
226 dy00 = _mm_sub_ps(iy0,jy0);
227 dz00 = _mm_sub_ps(iz0,jz0);
228 dx11 = _mm_sub_ps(ix1,jx1);
229 dy11 = _mm_sub_ps(iy1,jy1);
230 dz11 = _mm_sub_ps(iz1,jz1);
231 dx12 = _mm_sub_ps(ix1,jx2);
232 dy12 = _mm_sub_ps(iy1,jy2);
233 dz12 = _mm_sub_ps(iz1,jz2);
234 dx13 = _mm_sub_ps(ix1,jx3);
235 dy13 = _mm_sub_ps(iy1,jy3);
236 dz13 = _mm_sub_ps(iz1,jz3);
237 dx21 = _mm_sub_ps(ix2,jx1);
238 dy21 = _mm_sub_ps(iy2,jy1);
239 dz21 = _mm_sub_ps(iz2,jz1);
240 dx22 = _mm_sub_ps(ix2,jx2);
241 dy22 = _mm_sub_ps(iy2,jy2);
242 dz22 = _mm_sub_ps(iz2,jz2);
243 dx23 = _mm_sub_ps(ix2,jx3);
244 dy23 = _mm_sub_ps(iy2,jy3);
245 dz23 = _mm_sub_ps(iz2,jz3);
246 dx31 = _mm_sub_ps(ix3,jx1);
247 dy31 = _mm_sub_ps(iy3,jy1);
248 dz31 = _mm_sub_ps(iz3,jz1);
249 dx32 = _mm_sub_ps(ix3,jx2);
250 dy32 = _mm_sub_ps(iy3,jy2);
251 dz32 = _mm_sub_ps(iz3,jz2);
252 dx33 = _mm_sub_ps(ix3,jx3);
253 dy33 = _mm_sub_ps(iy3,jy3);
254 dz33 = _mm_sub_ps(iz3,jz3);
256 /* Calculate squared distance and things based on it */
257 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
258 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
259 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
260 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
261 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
262 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
263 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
264 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
265 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
266 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
268 rinv11 = gmx_mm_invsqrt_ps(rsq11);
269 rinv12 = gmx_mm_invsqrt_ps(rsq12);
270 rinv13 = gmx_mm_invsqrt_ps(rsq13);
271 rinv21 = gmx_mm_invsqrt_ps(rsq21);
272 rinv22 = gmx_mm_invsqrt_ps(rsq22);
273 rinv23 = gmx_mm_invsqrt_ps(rsq23);
274 rinv31 = gmx_mm_invsqrt_ps(rsq31);
275 rinv32 = gmx_mm_invsqrt_ps(rsq32);
276 rinv33 = gmx_mm_invsqrt_ps(rsq33);
278 rinvsq00 = gmx_mm_inv_ps(rsq00);
280 fjx0 = _mm_setzero_ps();
281 fjy0 = _mm_setzero_ps();
282 fjz0 = _mm_setzero_ps();
283 fjx1 = _mm_setzero_ps();
284 fjy1 = _mm_setzero_ps();
285 fjz1 = _mm_setzero_ps();
286 fjx2 = _mm_setzero_ps();
287 fjy2 = _mm_setzero_ps();
288 fjz2 = _mm_setzero_ps();
289 fjx3 = _mm_setzero_ps();
290 fjy3 = _mm_setzero_ps();
291 fjz3 = _mm_setzero_ps();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 /* LENNARD-JONES DISPERSION/REPULSION */
299 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
300 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
301 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
302 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
303 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
305 /* Update potential sum for this i atom from the interaction with this j atom. */
306 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
310 /* Calculate temporary vectorial force */
311 tx = _mm_mul_ps(fscal,dx00);
312 ty = _mm_mul_ps(fscal,dy00);
313 tz = _mm_mul_ps(fscal,dz00);
315 /* Update vectorial force */
316 fix0 = _mm_add_ps(fix0,tx);
317 fiy0 = _mm_add_ps(fiy0,ty);
318 fiz0 = _mm_add_ps(fiz0,tz);
320 fjx0 = _mm_add_ps(fjx0,tx);
321 fjy0 = _mm_add_ps(fjy0,ty);
322 fjz0 = _mm_add_ps(fjz0,tz);
324 /**************************
325 * CALCULATE INTERACTIONS *
326 **************************/
328 r11 = _mm_mul_ps(rsq11,rinv11);
330 /* Calculate table index by multiplying r with table scale and truncate to integer */
331 rt = _mm_mul_ps(r11,vftabscale);
332 vfitab = _mm_cvttps_epi32(rt);
333 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
334 vfitab = _mm_slli_epi32(vfitab,2);
336 /* CUBIC SPLINE TABLE ELECTROSTATICS */
337 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
338 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
339 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
340 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
341 _MM_TRANSPOSE4_PS(Y,F,G,H);
342 Heps = _mm_mul_ps(vfeps,H);
343 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
344 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
345 velec = _mm_mul_ps(qq11,VV);
346 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
347 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velecsum = _mm_add_ps(velecsum,velec);
354 /* Calculate temporary vectorial force */
355 tx = _mm_mul_ps(fscal,dx11);
356 ty = _mm_mul_ps(fscal,dy11);
357 tz = _mm_mul_ps(fscal,dz11);
359 /* Update vectorial force */
360 fix1 = _mm_add_ps(fix1,tx);
361 fiy1 = _mm_add_ps(fiy1,ty);
362 fiz1 = _mm_add_ps(fiz1,tz);
364 fjx1 = _mm_add_ps(fjx1,tx);
365 fjy1 = _mm_add_ps(fjy1,ty);
366 fjz1 = _mm_add_ps(fjz1,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 r12 = _mm_mul_ps(rsq12,rinv12);
374 /* Calculate table index by multiplying r with table scale and truncate to integer */
375 rt = _mm_mul_ps(r12,vftabscale);
376 vfitab = _mm_cvttps_epi32(rt);
377 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
378 vfitab = _mm_slli_epi32(vfitab,2);
380 /* CUBIC SPLINE TABLE ELECTROSTATICS */
381 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
382 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
383 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
384 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
385 _MM_TRANSPOSE4_PS(Y,F,G,H);
386 Heps = _mm_mul_ps(vfeps,H);
387 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
388 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
389 velec = _mm_mul_ps(qq12,VV);
390 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
391 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
393 /* Update potential sum for this i atom from the interaction with this j atom. */
394 velecsum = _mm_add_ps(velecsum,velec);
398 /* Calculate temporary vectorial force */
399 tx = _mm_mul_ps(fscal,dx12);
400 ty = _mm_mul_ps(fscal,dy12);
401 tz = _mm_mul_ps(fscal,dz12);
403 /* Update vectorial force */
404 fix1 = _mm_add_ps(fix1,tx);
405 fiy1 = _mm_add_ps(fiy1,ty);
406 fiz1 = _mm_add_ps(fiz1,tz);
408 fjx2 = _mm_add_ps(fjx2,tx);
409 fjy2 = _mm_add_ps(fjy2,ty);
410 fjz2 = _mm_add_ps(fjz2,tz);
412 /**************************
413 * CALCULATE INTERACTIONS *
414 **************************/
416 r13 = _mm_mul_ps(rsq13,rinv13);
418 /* Calculate table index by multiplying r with table scale and truncate to integer */
419 rt = _mm_mul_ps(r13,vftabscale);
420 vfitab = _mm_cvttps_epi32(rt);
421 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
422 vfitab = _mm_slli_epi32(vfitab,2);
424 /* CUBIC SPLINE TABLE ELECTROSTATICS */
425 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
426 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
427 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
428 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
429 _MM_TRANSPOSE4_PS(Y,F,G,H);
430 Heps = _mm_mul_ps(vfeps,H);
431 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
432 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
433 velec = _mm_mul_ps(qq13,VV);
434 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
435 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
437 /* Update potential sum for this i atom from the interaction with this j atom. */
438 velecsum = _mm_add_ps(velecsum,velec);
442 /* Calculate temporary vectorial force */
443 tx = _mm_mul_ps(fscal,dx13);
444 ty = _mm_mul_ps(fscal,dy13);
445 tz = _mm_mul_ps(fscal,dz13);
447 /* Update vectorial force */
448 fix1 = _mm_add_ps(fix1,tx);
449 fiy1 = _mm_add_ps(fiy1,ty);
450 fiz1 = _mm_add_ps(fiz1,tz);
452 fjx3 = _mm_add_ps(fjx3,tx);
453 fjy3 = _mm_add_ps(fjy3,ty);
454 fjz3 = _mm_add_ps(fjz3,tz);
456 /**************************
457 * CALCULATE INTERACTIONS *
458 **************************/
460 r21 = _mm_mul_ps(rsq21,rinv21);
462 /* Calculate table index by multiplying r with table scale and truncate to integer */
463 rt = _mm_mul_ps(r21,vftabscale);
464 vfitab = _mm_cvttps_epi32(rt);
465 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
466 vfitab = _mm_slli_epi32(vfitab,2);
468 /* CUBIC SPLINE TABLE ELECTROSTATICS */
469 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
470 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
471 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
472 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
473 _MM_TRANSPOSE4_PS(Y,F,G,H);
474 Heps = _mm_mul_ps(vfeps,H);
475 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
476 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
477 velec = _mm_mul_ps(qq21,VV);
478 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
479 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
481 /* Update potential sum for this i atom from the interaction with this j atom. */
482 velecsum = _mm_add_ps(velecsum,velec);
486 /* Calculate temporary vectorial force */
487 tx = _mm_mul_ps(fscal,dx21);
488 ty = _mm_mul_ps(fscal,dy21);
489 tz = _mm_mul_ps(fscal,dz21);
491 /* Update vectorial force */
492 fix2 = _mm_add_ps(fix2,tx);
493 fiy2 = _mm_add_ps(fiy2,ty);
494 fiz2 = _mm_add_ps(fiz2,tz);
496 fjx1 = _mm_add_ps(fjx1,tx);
497 fjy1 = _mm_add_ps(fjy1,ty);
498 fjz1 = _mm_add_ps(fjz1,tz);
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 r22 = _mm_mul_ps(rsq22,rinv22);
506 /* Calculate table index by multiplying r with table scale and truncate to integer */
507 rt = _mm_mul_ps(r22,vftabscale);
508 vfitab = _mm_cvttps_epi32(rt);
509 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
510 vfitab = _mm_slli_epi32(vfitab,2);
512 /* CUBIC SPLINE TABLE ELECTROSTATICS */
513 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
514 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
515 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
516 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
517 _MM_TRANSPOSE4_PS(Y,F,G,H);
518 Heps = _mm_mul_ps(vfeps,H);
519 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
520 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
521 velec = _mm_mul_ps(qq22,VV);
522 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
523 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm_add_ps(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm_mul_ps(fscal,dx22);
532 ty = _mm_mul_ps(fscal,dy22);
533 tz = _mm_mul_ps(fscal,dz22);
535 /* Update vectorial force */
536 fix2 = _mm_add_ps(fix2,tx);
537 fiy2 = _mm_add_ps(fiy2,ty);
538 fiz2 = _mm_add_ps(fiz2,tz);
540 fjx2 = _mm_add_ps(fjx2,tx);
541 fjy2 = _mm_add_ps(fjy2,ty);
542 fjz2 = _mm_add_ps(fjz2,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 r23 = _mm_mul_ps(rsq23,rinv23);
550 /* Calculate table index by multiplying r with table scale and truncate to integer */
551 rt = _mm_mul_ps(r23,vftabscale);
552 vfitab = _mm_cvttps_epi32(rt);
553 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
554 vfitab = _mm_slli_epi32(vfitab,2);
556 /* CUBIC SPLINE TABLE ELECTROSTATICS */
557 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
558 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
559 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
560 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
561 _MM_TRANSPOSE4_PS(Y,F,G,H);
562 Heps = _mm_mul_ps(vfeps,H);
563 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
564 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
565 velec = _mm_mul_ps(qq23,VV);
566 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
567 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
569 /* Update potential sum for this i atom from the interaction with this j atom. */
570 velecsum = _mm_add_ps(velecsum,velec);
574 /* Calculate temporary vectorial force */
575 tx = _mm_mul_ps(fscal,dx23);
576 ty = _mm_mul_ps(fscal,dy23);
577 tz = _mm_mul_ps(fscal,dz23);
579 /* Update vectorial force */
580 fix2 = _mm_add_ps(fix2,tx);
581 fiy2 = _mm_add_ps(fiy2,ty);
582 fiz2 = _mm_add_ps(fiz2,tz);
584 fjx3 = _mm_add_ps(fjx3,tx);
585 fjy3 = _mm_add_ps(fjy3,ty);
586 fjz3 = _mm_add_ps(fjz3,tz);
588 /**************************
589 * CALCULATE INTERACTIONS *
590 **************************/
592 r31 = _mm_mul_ps(rsq31,rinv31);
594 /* Calculate table index by multiplying r with table scale and truncate to integer */
595 rt = _mm_mul_ps(r31,vftabscale);
596 vfitab = _mm_cvttps_epi32(rt);
597 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
598 vfitab = _mm_slli_epi32(vfitab,2);
600 /* CUBIC SPLINE TABLE ELECTROSTATICS */
601 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
602 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
603 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
604 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
605 _MM_TRANSPOSE4_PS(Y,F,G,H);
606 Heps = _mm_mul_ps(vfeps,H);
607 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
608 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
609 velec = _mm_mul_ps(qq31,VV);
610 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
611 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
613 /* Update potential sum for this i atom from the interaction with this j atom. */
614 velecsum = _mm_add_ps(velecsum,velec);
618 /* Calculate temporary vectorial force */
619 tx = _mm_mul_ps(fscal,dx31);
620 ty = _mm_mul_ps(fscal,dy31);
621 tz = _mm_mul_ps(fscal,dz31);
623 /* Update vectorial force */
624 fix3 = _mm_add_ps(fix3,tx);
625 fiy3 = _mm_add_ps(fiy3,ty);
626 fiz3 = _mm_add_ps(fiz3,tz);
628 fjx1 = _mm_add_ps(fjx1,tx);
629 fjy1 = _mm_add_ps(fjy1,ty);
630 fjz1 = _mm_add_ps(fjz1,tz);
632 /**************************
633 * CALCULATE INTERACTIONS *
634 **************************/
636 r32 = _mm_mul_ps(rsq32,rinv32);
638 /* Calculate table index by multiplying r with table scale and truncate to integer */
639 rt = _mm_mul_ps(r32,vftabscale);
640 vfitab = _mm_cvttps_epi32(rt);
641 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
642 vfitab = _mm_slli_epi32(vfitab,2);
644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
645 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
646 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
647 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
648 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
649 _MM_TRANSPOSE4_PS(Y,F,G,H);
650 Heps = _mm_mul_ps(vfeps,H);
651 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
652 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
653 velec = _mm_mul_ps(qq32,VV);
654 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
655 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
657 /* Update potential sum for this i atom from the interaction with this j atom. */
658 velecsum = _mm_add_ps(velecsum,velec);
662 /* Calculate temporary vectorial force */
663 tx = _mm_mul_ps(fscal,dx32);
664 ty = _mm_mul_ps(fscal,dy32);
665 tz = _mm_mul_ps(fscal,dz32);
667 /* Update vectorial force */
668 fix3 = _mm_add_ps(fix3,tx);
669 fiy3 = _mm_add_ps(fiy3,ty);
670 fiz3 = _mm_add_ps(fiz3,tz);
672 fjx2 = _mm_add_ps(fjx2,tx);
673 fjy2 = _mm_add_ps(fjy2,ty);
674 fjz2 = _mm_add_ps(fjz2,tz);
676 /**************************
677 * CALCULATE INTERACTIONS *
678 **************************/
680 r33 = _mm_mul_ps(rsq33,rinv33);
682 /* Calculate table index by multiplying r with table scale and truncate to integer */
683 rt = _mm_mul_ps(r33,vftabscale);
684 vfitab = _mm_cvttps_epi32(rt);
685 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
686 vfitab = _mm_slli_epi32(vfitab,2);
688 /* CUBIC SPLINE TABLE ELECTROSTATICS */
689 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
690 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
691 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
692 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
693 _MM_TRANSPOSE4_PS(Y,F,G,H);
694 Heps = _mm_mul_ps(vfeps,H);
695 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
696 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
697 velec = _mm_mul_ps(qq33,VV);
698 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
699 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
701 /* Update potential sum for this i atom from the interaction with this j atom. */
702 velecsum = _mm_add_ps(velecsum,velec);
706 /* Calculate temporary vectorial force */
707 tx = _mm_mul_ps(fscal,dx33);
708 ty = _mm_mul_ps(fscal,dy33);
709 tz = _mm_mul_ps(fscal,dz33);
711 /* Update vectorial force */
712 fix3 = _mm_add_ps(fix3,tx);
713 fiy3 = _mm_add_ps(fiy3,ty);
714 fiz3 = _mm_add_ps(fiz3,tz);
716 fjx3 = _mm_add_ps(fjx3,tx);
717 fjy3 = _mm_add_ps(fjy3,ty);
718 fjz3 = _mm_add_ps(fjz3,tz);
720 fjptrA = f+j_coord_offsetA;
721 fjptrB = f+j_coord_offsetB;
722 fjptrC = f+j_coord_offsetC;
723 fjptrD = f+j_coord_offsetD;
725 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
726 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
727 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
729 /* Inner loop uses 422 flops */
735 /* Get j neighbor index, and coordinate index */
736 jnrlistA = jjnr[jidx];
737 jnrlistB = jjnr[jidx+1];
738 jnrlistC = jjnr[jidx+2];
739 jnrlistD = jjnr[jidx+3];
740 /* Sign of each element will be negative for non-real atoms.
741 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
742 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
744 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
745 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
746 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
747 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
748 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
749 j_coord_offsetA = DIM*jnrA;
750 j_coord_offsetB = DIM*jnrB;
751 j_coord_offsetC = DIM*jnrC;
752 j_coord_offsetD = DIM*jnrD;
754 /* load j atom coordinates */
755 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
756 x+j_coord_offsetC,x+j_coord_offsetD,
757 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
758 &jy2,&jz2,&jx3,&jy3,&jz3);
760 /* Calculate displacement vector */
761 dx00 = _mm_sub_ps(ix0,jx0);
762 dy00 = _mm_sub_ps(iy0,jy0);
763 dz00 = _mm_sub_ps(iz0,jz0);
764 dx11 = _mm_sub_ps(ix1,jx1);
765 dy11 = _mm_sub_ps(iy1,jy1);
766 dz11 = _mm_sub_ps(iz1,jz1);
767 dx12 = _mm_sub_ps(ix1,jx2);
768 dy12 = _mm_sub_ps(iy1,jy2);
769 dz12 = _mm_sub_ps(iz1,jz2);
770 dx13 = _mm_sub_ps(ix1,jx3);
771 dy13 = _mm_sub_ps(iy1,jy3);
772 dz13 = _mm_sub_ps(iz1,jz3);
773 dx21 = _mm_sub_ps(ix2,jx1);
774 dy21 = _mm_sub_ps(iy2,jy1);
775 dz21 = _mm_sub_ps(iz2,jz1);
776 dx22 = _mm_sub_ps(ix2,jx2);
777 dy22 = _mm_sub_ps(iy2,jy2);
778 dz22 = _mm_sub_ps(iz2,jz2);
779 dx23 = _mm_sub_ps(ix2,jx3);
780 dy23 = _mm_sub_ps(iy2,jy3);
781 dz23 = _mm_sub_ps(iz2,jz3);
782 dx31 = _mm_sub_ps(ix3,jx1);
783 dy31 = _mm_sub_ps(iy3,jy1);
784 dz31 = _mm_sub_ps(iz3,jz1);
785 dx32 = _mm_sub_ps(ix3,jx2);
786 dy32 = _mm_sub_ps(iy3,jy2);
787 dz32 = _mm_sub_ps(iz3,jz2);
788 dx33 = _mm_sub_ps(ix3,jx3);
789 dy33 = _mm_sub_ps(iy3,jy3);
790 dz33 = _mm_sub_ps(iz3,jz3);
792 /* Calculate squared distance and things based on it */
793 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
794 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
795 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
796 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
797 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
798 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
799 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
800 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
801 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
802 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
804 rinv11 = gmx_mm_invsqrt_ps(rsq11);
805 rinv12 = gmx_mm_invsqrt_ps(rsq12);
806 rinv13 = gmx_mm_invsqrt_ps(rsq13);
807 rinv21 = gmx_mm_invsqrt_ps(rsq21);
808 rinv22 = gmx_mm_invsqrt_ps(rsq22);
809 rinv23 = gmx_mm_invsqrt_ps(rsq23);
810 rinv31 = gmx_mm_invsqrt_ps(rsq31);
811 rinv32 = gmx_mm_invsqrt_ps(rsq32);
812 rinv33 = gmx_mm_invsqrt_ps(rsq33);
814 rinvsq00 = gmx_mm_inv_ps(rsq00);
816 fjx0 = _mm_setzero_ps();
817 fjy0 = _mm_setzero_ps();
818 fjz0 = _mm_setzero_ps();
819 fjx1 = _mm_setzero_ps();
820 fjy1 = _mm_setzero_ps();
821 fjz1 = _mm_setzero_ps();
822 fjx2 = _mm_setzero_ps();
823 fjy2 = _mm_setzero_ps();
824 fjz2 = _mm_setzero_ps();
825 fjx3 = _mm_setzero_ps();
826 fjy3 = _mm_setzero_ps();
827 fjz3 = _mm_setzero_ps();
829 /**************************
830 * CALCULATE INTERACTIONS *
831 **************************/
833 /* LENNARD-JONES DISPERSION/REPULSION */
835 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
836 vvdw6 = _mm_mul_ps(c6_00,rinvsix);
837 vvdw12 = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
838 vvdw = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
839 fvdw = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
841 /* Update potential sum for this i atom from the interaction with this j atom. */
842 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
843 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
847 fscal = _mm_andnot_ps(dummy_mask,fscal);
849 /* Calculate temporary vectorial force */
850 tx = _mm_mul_ps(fscal,dx00);
851 ty = _mm_mul_ps(fscal,dy00);
852 tz = _mm_mul_ps(fscal,dz00);
854 /* Update vectorial force */
855 fix0 = _mm_add_ps(fix0,tx);
856 fiy0 = _mm_add_ps(fiy0,ty);
857 fiz0 = _mm_add_ps(fiz0,tz);
859 fjx0 = _mm_add_ps(fjx0,tx);
860 fjy0 = _mm_add_ps(fjy0,ty);
861 fjz0 = _mm_add_ps(fjz0,tz);
863 /**************************
864 * CALCULATE INTERACTIONS *
865 **************************/
867 r11 = _mm_mul_ps(rsq11,rinv11);
868 r11 = _mm_andnot_ps(dummy_mask,r11);
870 /* Calculate table index by multiplying r with table scale and truncate to integer */
871 rt = _mm_mul_ps(r11,vftabscale);
872 vfitab = _mm_cvttps_epi32(rt);
873 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
874 vfitab = _mm_slli_epi32(vfitab,2);
876 /* CUBIC SPLINE TABLE ELECTROSTATICS */
877 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
878 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
879 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
880 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
881 _MM_TRANSPOSE4_PS(Y,F,G,H);
882 Heps = _mm_mul_ps(vfeps,H);
883 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
884 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
885 velec = _mm_mul_ps(qq11,VV);
886 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
887 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
889 /* Update potential sum for this i atom from the interaction with this j atom. */
890 velec = _mm_andnot_ps(dummy_mask,velec);
891 velecsum = _mm_add_ps(velecsum,velec);
895 fscal = _mm_andnot_ps(dummy_mask,fscal);
897 /* Calculate temporary vectorial force */
898 tx = _mm_mul_ps(fscal,dx11);
899 ty = _mm_mul_ps(fscal,dy11);
900 tz = _mm_mul_ps(fscal,dz11);
902 /* Update vectorial force */
903 fix1 = _mm_add_ps(fix1,tx);
904 fiy1 = _mm_add_ps(fiy1,ty);
905 fiz1 = _mm_add_ps(fiz1,tz);
907 fjx1 = _mm_add_ps(fjx1,tx);
908 fjy1 = _mm_add_ps(fjy1,ty);
909 fjz1 = _mm_add_ps(fjz1,tz);
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 r12 = _mm_mul_ps(rsq12,rinv12);
916 r12 = _mm_andnot_ps(dummy_mask,r12);
918 /* Calculate table index by multiplying r with table scale and truncate to integer */
919 rt = _mm_mul_ps(r12,vftabscale);
920 vfitab = _mm_cvttps_epi32(rt);
921 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
922 vfitab = _mm_slli_epi32(vfitab,2);
924 /* CUBIC SPLINE TABLE ELECTROSTATICS */
925 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
926 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
927 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
928 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
929 _MM_TRANSPOSE4_PS(Y,F,G,H);
930 Heps = _mm_mul_ps(vfeps,H);
931 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
932 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
933 velec = _mm_mul_ps(qq12,VV);
934 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
935 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
937 /* Update potential sum for this i atom from the interaction with this j atom. */
938 velec = _mm_andnot_ps(dummy_mask,velec);
939 velecsum = _mm_add_ps(velecsum,velec);
943 fscal = _mm_andnot_ps(dummy_mask,fscal);
945 /* Calculate temporary vectorial force */
946 tx = _mm_mul_ps(fscal,dx12);
947 ty = _mm_mul_ps(fscal,dy12);
948 tz = _mm_mul_ps(fscal,dz12);
950 /* Update vectorial force */
951 fix1 = _mm_add_ps(fix1,tx);
952 fiy1 = _mm_add_ps(fiy1,ty);
953 fiz1 = _mm_add_ps(fiz1,tz);
955 fjx2 = _mm_add_ps(fjx2,tx);
956 fjy2 = _mm_add_ps(fjy2,ty);
957 fjz2 = _mm_add_ps(fjz2,tz);
959 /**************************
960 * CALCULATE INTERACTIONS *
961 **************************/
963 r13 = _mm_mul_ps(rsq13,rinv13);
964 r13 = _mm_andnot_ps(dummy_mask,r13);
966 /* Calculate table index by multiplying r with table scale and truncate to integer */
967 rt = _mm_mul_ps(r13,vftabscale);
968 vfitab = _mm_cvttps_epi32(rt);
969 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
970 vfitab = _mm_slli_epi32(vfitab,2);
972 /* CUBIC SPLINE TABLE ELECTROSTATICS */
973 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
974 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
975 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
976 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
977 _MM_TRANSPOSE4_PS(Y,F,G,H);
978 Heps = _mm_mul_ps(vfeps,H);
979 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
980 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
981 velec = _mm_mul_ps(qq13,VV);
982 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
983 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
985 /* Update potential sum for this i atom from the interaction with this j atom. */
986 velec = _mm_andnot_ps(dummy_mask,velec);
987 velecsum = _mm_add_ps(velecsum,velec);
991 fscal = _mm_andnot_ps(dummy_mask,fscal);
993 /* Calculate temporary vectorial force */
994 tx = _mm_mul_ps(fscal,dx13);
995 ty = _mm_mul_ps(fscal,dy13);
996 tz = _mm_mul_ps(fscal,dz13);
998 /* Update vectorial force */
999 fix1 = _mm_add_ps(fix1,tx);
1000 fiy1 = _mm_add_ps(fiy1,ty);
1001 fiz1 = _mm_add_ps(fiz1,tz);
1003 fjx3 = _mm_add_ps(fjx3,tx);
1004 fjy3 = _mm_add_ps(fjy3,ty);
1005 fjz3 = _mm_add_ps(fjz3,tz);
1007 /**************************
1008 * CALCULATE INTERACTIONS *
1009 **************************/
1011 r21 = _mm_mul_ps(rsq21,rinv21);
1012 r21 = _mm_andnot_ps(dummy_mask,r21);
1014 /* Calculate table index by multiplying r with table scale and truncate to integer */
1015 rt = _mm_mul_ps(r21,vftabscale);
1016 vfitab = _mm_cvttps_epi32(rt);
1017 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1018 vfitab = _mm_slli_epi32(vfitab,2);
1020 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1021 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1022 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1023 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1024 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1025 _MM_TRANSPOSE4_PS(Y,F,G,H);
1026 Heps = _mm_mul_ps(vfeps,H);
1027 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1028 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1029 velec = _mm_mul_ps(qq21,VV);
1030 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1031 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1033 /* Update potential sum for this i atom from the interaction with this j atom. */
1034 velec = _mm_andnot_ps(dummy_mask,velec);
1035 velecsum = _mm_add_ps(velecsum,velec);
1039 fscal = _mm_andnot_ps(dummy_mask,fscal);
1041 /* Calculate temporary vectorial force */
1042 tx = _mm_mul_ps(fscal,dx21);
1043 ty = _mm_mul_ps(fscal,dy21);
1044 tz = _mm_mul_ps(fscal,dz21);
1046 /* Update vectorial force */
1047 fix2 = _mm_add_ps(fix2,tx);
1048 fiy2 = _mm_add_ps(fiy2,ty);
1049 fiz2 = _mm_add_ps(fiz2,tz);
1051 fjx1 = _mm_add_ps(fjx1,tx);
1052 fjy1 = _mm_add_ps(fjy1,ty);
1053 fjz1 = _mm_add_ps(fjz1,tz);
1055 /**************************
1056 * CALCULATE INTERACTIONS *
1057 **************************/
1059 r22 = _mm_mul_ps(rsq22,rinv22);
1060 r22 = _mm_andnot_ps(dummy_mask,r22);
1062 /* Calculate table index by multiplying r with table scale and truncate to integer */
1063 rt = _mm_mul_ps(r22,vftabscale);
1064 vfitab = _mm_cvttps_epi32(rt);
1065 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1066 vfitab = _mm_slli_epi32(vfitab,2);
1068 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1069 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1070 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1071 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1072 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1073 _MM_TRANSPOSE4_PS(Y,F,G,H);
1074 Heps = _mm_mul_ps(vfeps,H);
1075 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1076 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1077 velec = _mm_mul_ps(qq22,VV);
1078 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1079 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1081 /* Update potential sum for this i atom from the interaction with this j atom. */
1082 velec = _mm_andnot_ps(dummy_mask,velec);
1083 velecsum = _mm_add_ps(velecsum,velec);
1087 fscal = _mm_andnot_ps(dummy_mask,fscal);
1089 /* Calculate temporary vectorial force */
1090 tx = _mm_mul_ps(fscal,dx22);
1091 ty = _mm_mul_ps(fscal,dy22);
1092 tz = _mm_mul_ps(fscal,dz22);
1094 /* Update vectorial force */
1095 fix2 = _mm_add_ps(fix2,tx);
1096 fiy2 = _mm_add_ps(fiy2,ty);
1097 fiz2 = _mm_add_ps(fiz2,tz);
1099 fjx2 = _mm_add_ps(fjx2,tx);
1100 fjy2 = _mm_add_ps(fjy2,ty);
1101 fjz2 = _mm_add_ps(fjz2,tz);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 r23 = _mm_mul_ps(rsq23,rinv23);
1108 r23 = _mm_andnot_ps(dummy_mask,r23);
1110 /* Calculate table index by multiplying r with table scale and truncate to integer */
1111 rt = _mm_mul_ps(r23,vftabscale);
1112 vfitab = _mm_cvttps_epi32(rt);
1113 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1114 vfitab = _mm_slli_epi32(vfitab,2);
1116 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1117 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1118 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1119 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1120 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1121 _MM_TRANSPOSE4_PS(Y,F,G,H);
1122 Heps = _mm_mul_ps(vfeps,H);
1123 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1124 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1125 velec = _mm_mul_ps(qq23,VV);
1126 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1127 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1129 /* Update potential sum for this i atom from the interaction with this j atom. */
1130 velec = _mm_andnot_ps(dummy_mask,velec);
1131 velecsum = _mm_add_ps(velecsum,velec);
1135 fscal = _mm_andnot_ps(dummy_mask,fscal);
1137 /* Calculate temporary vectorial force */
1138 tx = _mm_mul_ps(fscal,dx23);
1139 ty = _mm_mul_ps(fscal,dy23);
1140 tz = _mm_mul_ps(fscal,dz23);
1142 /* Update vectorial force */
1143 fix2 = _mm_add_ps(fix2,tx);
1144 fiy2 = _mm_add_ps(fiy2,ty);
1145 fiz2 = _mm_add_ps(fiz2,tz);
1147 fjx3 = _mm_add_ps(fjx3,tx);
1148 fjy3 = _mm_add_ps(fjy3,ty);
1149 fjz3 = _mm_add_ps(fjz3,tz);
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 r31 = _mm_mul_ps(rsq31,rinv31);
1156 r31 = _mm_andnot_ps(dummy_mask,r31);
1158 /* Calculate table index by multiplying r with table scale and truncate to integer */
1159 rt = _mm_mul_ps(r31,vftabscale);
1160 vfitab = _mm_cvttps_epi32(rt);
1161 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1162 vfitab = _mm_slli_epi32(vfitab,2);
1164 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1165 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1166 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1167 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1168 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1169 _MM_TRANSPOSE4_PS(Y,F,G,H);
1170 Heps = _mm_mul_ps(vfeps,H);
1171 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1172 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1173 velec = _mm_mul_ps(qq31,VV);
1174 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1175 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec = _mm_andnot_ps(dummy_mask,velec);
1179 velecsum = _mm_add_ps(velecsum,velec);
1183 fscal = _mm_andnot_ps(dummy_mask,fscal);
1185 /* Calculate temporary vectorial force */
1186 tx = _mm_mul_ps(fscal,dx31);
1187 ty = _mm_mul_ps(fscal,dy31);
1188 tz = _mm_mul_ps(fscal,dz31);
1190 /* Update vectorial force */
1191 fix3 = _mm_add_ps(fix3,tx);
1192 fiy3 = _mm_add_ps(fiy3,ty);
1193 fiz3 = _mm_add_ps(fiz3,tz);
1195 fjx1 = _mm_add_ps(fjx1,tx);
1196 fjy1 = _mm_add_ps(fjy1,ty);
1197 fjz1 = _mm_add_ps(fjz1,tz);
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 r32 = _mm_mul_ps(rsq32,rinv32);
1204 r32 = _mm_andnot_ps(dummy_mask,r32);
1206 /* Calculate table index by multiplying r with table scale and truncate to integer */
1207 rt = _mm_mul_ps(r32,vftabscale);
1208 vfitab = _mm_cvttps_epi32(rt);
1209 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1210 vfitab = _mm_slli_epi32(vfitab,2);
1212 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1213 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1214 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1215 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1216 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1217 _MM_TRANSPOSE4_PS(Y,F,G,H);
1218 Heps = _mm_mul_ps(vfeps,H);
1219 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1220 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1221 velec = _mm_mul_ps(qq32,VV);
1222 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1223 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1225 /* Update potential sum for this i atom from the interaction with this j atom. */
1226 velec = _mm_andnot_ps(dummy_mask,velec);
1227 velecsum = _mm_add_ps(velecsum,velec);
1231 fscal = _mm_andnot_ps(dummy_mask,fscal);
1233 /* Calculate temporary vectorial force */
1234 tx = _mm_mul_ps(fscal,dx32);
1235 ty = _mm_mul_ps(fscal,dy32);
1236 tz = _mm_mul_ps(fscal,dz32);
1238 /* Update vectorial force */
1239 fix3 = _mm_add_ps(fix3,tx);
1240 fiy3 = _mm_add_ps(fiy3,ty);
1241 fiz3 = _mm_add_ps(fiz3,tz);
1243 fjx2 = _mm_add_ps(fjx2,tx);
1244 fjy2 = _mm_add_ps(fjy2,ty);
1245 fjz2 = _mm_add_ps(fjz2,tz);
1247 /**************************
1248 * CALCULATE INTERACTIONS *
1249 **************************/
1251 r33 = _mm_mul_ps(rsq33,rinv33);
1252 r33 = _mm_andnot_ps(dummy_mask,r33);
1254 /* Calculate table index by multiplying r with table scale and truncate to integer */
1255 rt = _mm_mul_ps(r33,vftabscale);
1256 vfitab = _mm_cvttps_epi32(rt);
1257 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1258 vfitab = _mm_slli_epi32(vfitab,2);
1260 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1261 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1262 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1263 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1264 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1265 _MM_TRANSPOSE4_PS(Y,F,G,H);
1266 Heps = _mm_mul_ps(vfeps,H);
1267 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1268 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1269 velec = _mm_mul_ps(qq33,VV);
1270 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1271 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1273 /* Update potential sum for this i atom from the interaction with this j atom. */
1274 velec = _mm_andnot_ps(dummy_mask,velec);
1275 velecsum = _mm_add_ps(velecsum,velec);
1279 fscal = _mm_andnot_ps(dummy_mask,fscal);
1281 /* Calculate temporary vectorial force */
1282 tx = _mm_mul_ps(fscal,dx33);
1283 ty = _mm_mul_ps(fscal,dy33);
1284 tz = _mm_mul_ps(fscal,dz33);
1286 /* Update vectorial force */
1287 fix3 = _mm_add_ps(fix3,tx);
1288 fiy3 = _mm_add_ps(fiy3,ty);
1289 fiz3 = _mm_add_ps(fiz3,tz);
1291 fjx3 = _mm_add_ps(fjx3,tx);
1292 fjy3 = _mm_add_ps(fjy3,ty);
1293 fjz3 = _mm_add_ps(fjz3,tz);
1295 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1296 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1297 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1298 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1300 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1301 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1302 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1304 /* Inner loop uses 431 flops */
1307 /* End of innermost loop */
1309 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1310 f+i_coord_offset,fshift+i_shift_offset);
1313 /* Update potential energies */
1314 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1315 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1317 /* Increment number of inner iterations */
1318 inneriter += j_index_end - j_index_start;
1320 /* Outer loop uses 26 flops */
1323 /* Increment number of outer iterations */
1326 /* Update outer/inner flops */
1328 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431);
1331 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single
1332 * Electrostatics interaction: CubicSplineTable
1333 * VdW interaction: LennardJones
1334 * Geometry: Water4-Water4
1335 * Calculate force/pot: Force
1338 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_sse4_1_single
1339 (t_nblist * gmx_restrict nlist,
1340 rvec * gmx_restrict xx,
1341 rvec * gmx_restrict ff,
1342 t_forcerec * gmx_restrict fr,
1343 t_mdatoms * gmx_restrict mdatoms,
1344 nb_kernel_data_t * gmx_restrict kernel_data,
1345 t_nrnb * gmx_restrict nrnb)
1347 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1348 * just 0 for non-waters.
1349 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1350 * jnr indices corresponding to data put in the four positions in the SIMD register.
1352 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1353 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1354 int jnrA,jnrB,jnrC,jnrD;
1355 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1356 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1357 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1358 real rcutoff_scalar;
1359 real *shiftvec,*fshift,*x,*f;
1360 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1361 real scratch[4*DIM];
1362 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1364 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1366 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1368 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1370 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1371 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1372 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1373 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1374 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1375 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1376 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1377 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1378 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1379 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1380 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1381 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1382 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1383 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1384 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1385 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1386 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1387 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1388 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1389 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1392 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1395 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1396 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1398 __m128i ifour = _mm_set1_epi32(4);
1399 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1401 __m128 dummy_mask,cutoff_mask;
1402 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1403 __m128 one = _mm_set1_ps(1.0);
1404 __m128 two = _mm_set1_ps(2.0);
1410 jindex = nlist->jindex;
1412 shiftidx = nlist->shift;
1414 shiftvec = fr->shift_vec[0];
1415 fshift = fr->fshift[0];
1416 facel = _mm_set1_ps(fr->epsfac);
1417 charge = mdatoms->chargeA;
1418 nvdwtype = fr->ntype;
1419 vdwparam = fr->nbfp;
1420 vdwtype = mdatoms->typeA;
1422 vftab = kernel_data->table_elec->data;
1423 vftabscale = _mm_set1_ps(kernel_data->table_elec->scale);
1425 /* Setup water-specific parameters */
1426 inr = nlist->iinr[0];
1427 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1428 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1429 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1430 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1432 jq1 = _mm_set1_ps(charge[inr+1]);
1433 jq2 = _mm_set1_ps(charge[inr+2]);
1434 jq3 = _mm_set1_ps(charge[inr+3]);
1435 vdwjidx0A = 2*vdwtype[inr+0];
1436 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1437 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1438 qq11 = _mm_mul_ps(iq1,jq1);
1439 qq12 = _mm_mul_ps(iq1,jq2);
1440 qq13 = _mm_mul_ps(iq1,jq3);
1441 qq21 = _mm_mul_ps(iq2,jq1);
1442 qq22 = _mm_mul_ps(iq2,jq2);
1443 qq23 = _mm_mul_ps(iq2,jq3);
1444 qq31 = _mm_mul_ps(iq3,jq1);
1445 qq32 = _mm_mul_ps(iq3,jq2);
1446 qq33 = _mm_mul_ps(iq3,jq3);
1448 /* Avoid stupid compiler warnings */
1449 jnrA = jnrB = jnrC = jnrD = 0;
1450 j_coord_offsetA = 0;
1451 j_coord_offsetB = 0;
1452 j_coord_offsetC = 0;
1453 j_coord_offsetD = 0;
1458 for(iidx=0;iidx<4*DIM;iidx++)
1460 scratch[iidx] = 0.0;
1463 /* Start outer loop over neighborlists */
1464 for(iidx=0; iidx<nri; iidx++)
1466 /* Load shift vector for this list */
1467 i_shift_offset = DIM*shiftidx[iidx];
1469 /* Load limits for loop over neighbors */
1470 j_index_start = jindex[iidx];
1471 j_index_end = jindex[iidx+1];
1473 /* Get outer coordinate index */
1475 i_coord_offset = DIM*inr;
1477 /* Load i particle coords and add shift vector */
1478 gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1479 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1481 fix0 = _mm_setzero_ps();
1482 fiy0 = _mm_setzero_ps();
1483 fiz0 = _mm_setzero_ps();
1484 fix1 = _mm_setzero_ps();
1485 fiy1 = _mm_setzero_ps();
1486 fiz1 = _mm_setzero_ps();
1487 fix2 = _mm_setzero_ps();
1488 fiy2 = _mm_setzero_ps();
1489 fiz2 = _mm_setzero_ps();
1490 fix3 = _mm_setzero_ps();
1491 fiy3 = _mm_setzero_ps();
1492 fiz3 = _mm_setzero_ps();
1494 /* Start inner kernel loop */
1495 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1498 /* Get j neighbor index, and coordinate index */
1500 jnrB = jjnr[jidx+1];
1501 jnrC = jjnr[jidx+2];
1502 jnrD = jjnr[jidx+3];
1503 j_coord_offsetA = DIM*jnrA;
1504 j_coord_offsetB = DIM*jnrB;
1505 j_coord_offsetC = DIM*jnrC;
1506 j_coord_offsetD = DIM*jnrD;
1508 /* load j atom coordinates */
1509 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1510 x+j_coord_offsetC,x+j_coord_offsetD,
1511 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1512 &jy2,&jz2,&jx3,&jy3,&jz3);
1514 /* Calculate displacement vector */
1515 dx00 = _mm_sub_ps(ix0,jx0);
1516 dy00 = _mm_sub_ps(iy0,jy0);
1517 dz00 = _mm_sub_ps(iz0,jz0);
1518 dx11 = _mm_sub_ps(ix1,jx1);
1519 dy11 = _mm_sub_ps(iy1,jy1);
1520 dz11 = _mm_sub_ps(iz1,jz1);
1521 dx12 = _mm_sub_ps(ix1,jx2);
1522 dy12 = _mm_sub_ps(iy1,jy2);
1523 dz12 = _mm_sub_ps(iz1,jz2);
1524 dx13 = _mm_sub_ps(ix1,jx3);
1525 dy13 = _mm_sub_ps(iy1,jy3);
1526 dz13 = _mm_sub_ps(iz1,jz3);
1527 dx21 = _mm_sub_ps(ix2,jx1);
1528 dy21 = _mm_sub_ps(iy2,jy1);
1529 dz21 = _mm_sub_ps(iz2,jz1);
1530 dx22 = _mm_sub_ps(ix2,jx2);
1531 dy22 = _mm_sub_ps(iy2,jy2);
1532 dz22 = _mm_sub_ps(iz2,jz2);
1533 dx23 = _mm_sub_ps(ix2,jx3);
1534 dy23 = _mm_sub_ps(iy2,jy3);
1535 dz23 = _mm_sub_ps(iz2,jz3);
1536 dx31 = _mm_sub_ps(ix3,jx1);
1537 dy31 = _mm_sub_ps(iy3,jy1);
1538 dz31 = _mm_sub_ps(iz3,jz1);
1539 dx32 = _mm_sub_ps(ix3,jx2);
1540 dy32 = _mm_sub_ps(iy3,jy2);
1541 dz32 = _mm_sub_ps(iz3,jz2);
1542 dx33 = _mm_sub_ps(ix3,jx3);
1543 dy33 = _mm_sub_ps(iy3,jy3);
1544 dz33 = _mm_sub_ps(iz3,jz3);
1546 /* Calculate squared distance and things based on it */
1547 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1548 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1549 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1550 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1551 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1552 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1553 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1554 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1555 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1556 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1558 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1559 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1560 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1561 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1562 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1563 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1564 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1565 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1566 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1568 rinvsq00 = gmx_mm_inv_ps(rsq00);
1570 fjx0 = _mm_setzero_ps();
1571 fjy0 = _mm_setzero_ps();
1572 fjz0 = _mm_setzero_ps();
1573 fjx1 = _mm_setzero_ps();
1574 fjy1 = _mm_setzero_ps();
1575 fjz1 = _mm_setzero_ps();
1576 fjx2 = _mm_setzero_ps();
1577 fjy2 = _mm_setzero_ps();
1578 fjz2 = _mm_setzero_ps();
1579 fjx3 = _mm_setzero_ps();
1580 fjy3 = _mm_setzero_ps();
1581 fjz3 = _mm_setzero_ps();
1583 /**************************
1584 * CALCULATE INTERACTIONS *
1585 **************************/
1587 /* LENNARD-JONES DISPERSION/REPULSION */
1589 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1590 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1594 /* Calculate temporary vectorial force */
1595 tx = _mm_mul_ps(fscal,dx00);
1596 ty = _mm_mul_ps(fscal,dy00);
1597 tz = _mm_mul_ps(fscal,dz00);
1599 /* Update vectorial force */
1600 fix0 = _mm_add_ps(fix0,tx);
1601 fiy0 = _mm_add_ps(fiy0,ty);
1602 fiz0 = _mm_add_ps(fiz0,tz);
1604 fjx0 = _mm_add_ps(fjx0,tx);
1605 fjy0 = _mm_add_ps(fjy0,ty);
1606 fjz0 = _mm_add_ps(fjz0,tz);
1608 /**************************
1609 * CALCULATE INTERACTIONS *
1610 **************************/
1612 r11 = _mm_mul_ps(rsq11,rinv11);
1614 /* Calculate table index by multiplying r with table scale and truncate to integer */
1615 rt = _mm_mul_ps(r11,vftabscale);
1616 vfitab = _mm_cvttps_epi32(rt);
1617 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1618 vfitab = _mm_slli_epi32(vfitab,2);
1620 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1621 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1622 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1623 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1624 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1625 _MM_TRANSPOSE4_PS(Y,F,G,H);
1626 Heps = _mm_mul_ps(vfeps,H);
1627 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1628 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1629 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1633 /* Calculate temporary vectorial force */
1634 tx = _mm_mul_ps(fscal,dx11);
1635 ty = _mm_mul_ps(fscal,dy11);
1636 tz = _mm_mul_ps(fscal,dz11);
1638 /* Update vectorial force */
1639 fix1 = _mm_add_ps(fix1,tx);
1640 fiy1 = _mm_add_ps(fiy1,ty);
1641 fiz1 = _mm_add_ps(fiz1,tz);
1643 fjx1 = _mm_add_ps(fjx1,tx);
1644 fjy1 = _mm_add_ps(fjy1,ty);
1645 fjz1 = _mm_add_ps(fjz1,tz);
1647 /**************************
1648 * CALCULATE INTERACTIONS *
1649 **************************/
1651 r12 = _mm_mul_ps(rsq12,rinv12);
1653 /* Calculate table index by multiplying r with table scale and truncate to integer */
1654 rt = _mm_mul_ps(r12,vftabscale);
1655 vfitab = _mm_cvttps_epi32(rt);
1656 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1657 vfitab = _mm_slli_epi32(vfitab,2);
1659 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1660 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1661 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1662 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1663 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1664 _MM_TRANSPOSE4_PS(Y,F,G,H);
1665 Heps = _mm_mul_ps(vfeps,H);
1666 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1667 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1668 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1672 /* Calculate temporary vectorial force */
1673 tx = _mm_mul_ps(fscal,dx12);
1674 ty = _mm_mul_ps(fscal,dy12);
1675 tz = _mm_mul_ps(fscal,dz12);
1677 /* Update vectorial force */
1678 fix1 = _mm_add_ps(fix1,tx);
1679 fiy1 = _mm_add_ps(fiy1,ty);
1680 fiz1 = _mm_add_ps(fiz1,tz);
1682 fjx2 = _mm_add_ps(fjx2,tx);
1683 fjy2 = _mm_add_ps(fjy2,ty);
1684 fjz2 = _mm_add_ps(fjz2,tz);
1686 /**************************
1687 * CALCULATE INTERACTIONS *
1688 **************************/
1690 r13 = _mm_mul_ps(rsq13,rinv13);
1692 /* Calculate table index by multiplying r with table scale and truncate to integer */
1693 rt = _mm_mul_ps(r13,vftabscale);
1694 vfitab = _mm_cvttps_epi32(rt);
1695 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1696 vfitab = _mm_slli_epi32(vfitab,2);
1698 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1699 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1700 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1701 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1702 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1703 _MM_TRANSPOSE4_PS(Y,F,G,H);
1704 Heps = _mm_mul_ps(vfeps,H);
1705 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1706 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1707 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1711 /* Calculate temporary vectorial force */
1712 tx = _mm_mul_ps(fscal,dx13);
1713 ty = _mm_mul_ps(fscal,dy13);
1714 tz = _mm_mul_ps(fscal,dz13);
1716 /* Update vectorial force */
1717 fix1 = _mm_add_ps(fix1,tx);
1718 fiy1 = _mm_add_ps(fiy1,ty);
1719 fiz1 = _mm_add_ps(fiz1,tz);
1721 fjx3 = _mm_add_ps(fjx3,tx);
1722 fjy3 = _mm_add_ps(fjy3,ty);
1723 fjz3 = _mm_add_ps(fjz3,tz);
1725 /**************************
1726 * CALCULATE INTERACTIONS *
1727 **************************/
1729 r21 = _mm_mul_ps(rsq21,rinv21);
1731 /* Calculate table index by multiplying r with table scale and truncate to integer */
1732 rt = _mm_mul_ps(r21,vftabscale);
1733 vfitab = _mm_cvttps_epi32(rt);
1734 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1735 vfitab = _mm_slli_epi32(vfitab,2);
1737 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1738 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1739 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1740 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1741 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1742 _MM_TRANSPOSE4_PS(Y,F,G,H);
1743 Heps = _mm_mul_ps(vfeps,H);
1744 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1745 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1746 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1750 /* Calculate temporary vectorial force */
1751 tx = _mm_mul_ps(fscal,dx21);
1752 ty = _mm_mul_ps(fscal,dy21);
1753 tz = _mm_mul_ps(fscal,dz21);
1755 /* Update vectorial force */
1756 fix2 = _mm_add_ps(fix2,tx);
1757 fiy2 = _mm_add_ps(fiy2,ty);
1758 fiz2 = _mm_add_ps(fiz2,tz);
1760 fjx1 = _mm_add_ps(fjx1,tx);
1761 fjy1 = _mm_add_ps(fjy1,ty);
1762 fjz1 = _mm_add_ps(fjz1,tz);
1764 /**************************
1765 * CALCULATE INTERACTIONS *
1766 **************************/
1768 r22 = _mm_mul_ps(rsq22,rinv22);
1770 /* Calculate table index by multiplying r with table scale and truncate to integer */
1771 rt = _mm_mul_ps(r22,vftabscale);
1772 vfitab = _mm_cvttps_epi32(rt);
1773 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1774 vfitab = _mm_slli_epi32(vfitab,2);
1776 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1777 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1778 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1779 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1780 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1781 _MM_TRANSPOSE4_PS(Y,F,G,H);
1782 Heps = _mm_mul_ps(vfeps,H);
1783 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1784 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1785 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1789 /* Calculate temporary vectorial force */
1790 tx = _mm_mul_ps(fscal,dx22);
1791 ty = _mm_mul_ps(fscal,dy22);
1792 tz = _mm_mul_ps(fscal,dz22);
1794 /* Update vectorial force */
1795 fix2 = _mm_add_ps(fix2,tx);
1796 fiy2 = _mm_add_ps(fiy2,ty);
1797 fiz2 = _mm_add_ps(fiz2,tz);
1799 fjx2 = _mm_add_ps(fjx2,tx);
1800 fjy2 = _mm_add_ps(fjy2,ty);
1801 fjz2 = _mm_add_ps(fjz2,tz);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 r23 = _mm_mul_ps(rsq23,rinv23);
1809 /* Calculate table index by multiplying r with table scale and truncate to integer */
1810 rt = _mm_mul_ps(r23,vftabscale);
1811 vfitab = _mm_cvttps_epi32(rt);
1812 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1813 vfitab = _mm_slli_epi32(vfitab,2);
1815 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1816 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1817 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1818 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1819 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1820 _MM_TRANSPOSE4_PS(Y,F,G,H);
1821 Heps = _mm_mul_ps(vfeps,H);
1822 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1823 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1824 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1828 /* Calculate temporary vectorial force */
1829 tx = _mm_mul_ps(fscal,dx23);
1830 ty = _mm_mul_ps(fscal,dy23);
1831 tz = _mm_mul_ps(fscal,dz23);
1833 /* Update vectorial force */
1834 fix2 = _mm_add_ps(fix2,tx);
1835 fiy2 = _mm_add_ps(fiy2,ty);
1836 fiz2 = _mm_add_ps(fiz2,tz);
1838 fjx3 = _mm_add_ps(fjx3,tx);
1839 fjy3 = _mm_add_ps(fjy3,ty);
1840 fjz3 = _mm_add_ps(fjz3,tz);
1842 /**************************
1843 * CALCULATE INTERACTIONS *
1844 **************************/
1846 r31 = _mm_mul_ps(rsq31,rinv31);
1848 /* Calculate table index by multiplying r with table scale and truncate to integer */
1849 rt = _mm_mul_ps(r31,vftabscale);
1850 vfitab = _mm_cvttps_epi32(rt);
1851 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1852 vfitab = _mm_slli_epi32(vfitab,2);
1854 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1855 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1856 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1857 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1858 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1859 _MM_TRANSPOSE4_PS(Y,F,G,H);
1860 Heps = _mm_mul_ps(vfeps,H);
1861 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1862 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1863 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1867 /* Calculate temporary vectorial force */
1868 tx = _mm_mul_ps(fscal,dx31);
1869 ty = _mm_mul_ps(fscal,dy31);
1870 tz = _mm_mul_ps(fscal,dz31);
1872 /* Update vectorial force */
1873 fix3 = _mm_add_ps(fix3,tx);
1874 fiy3 = _mm_add_ps(fiy3,ty);
1875 fiz3 = _mm_add_ps(fiz3,tz);
1877 fjx1 = _mm_add_ps(fjx1,tx);
1878 fjy1 = _mm_add_ps(fjy1,ty);
1879 fjz1 = _mm_add_ps(fjz1,tz);
1881 /**************************
1882 * CALCULATE INTERACTIONS *
1883 **************************/
1885 r32 = _mm_mul_ps(rsq32,rinv32);
1887 /* Calculate table index by multiplying r with table scale and truncate to integer */
1888 rt = _mm_mul_ps(r32,vftabscale);
1889 vfitab = _mm_cvttps_epi32(rt);
1890 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1891 vfitab = _mm_slli_epi32(vfitab,2);
1893 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1894 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1895 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1896 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1897 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1898 _MM_TRANSPOSE4_PS(Y,F,G,H);
1899 Heps = _mm_mul_ps(vfeps,H);
1900 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1901 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1902 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1906 /* Calculate temporary vectorial force */
1907 tx = _mm_mul_ps(fscal,dx32);
1908 ty = _mm_mul_ps(fscal,dy32);
1909 tz = _mm_mul_ps(fscal,dz32);
1911 /* Update vectorial force */
1912 fix3 = _mm_add_ps(fix3,tx);
1913 fiy3 = _mm_add_ps(fiy3,ty);
1914 fiz3 = _mm_add_ps(fiz3,tz);
1916 fjx2 = _mm_add_ps(fjx2,tx);
1917 fjy2 = _mm_add_ps(fjy2,ty);
1918 fjz2 = _mm_add_ps(fjz2,tz);
1920 /**************************
1921 * CALCULATE INTERACTIONS *
1922 **************************/
1924 r33 = _mm_mul_ps(rsq33,rinv33);
1926 /* Calculate table index by multiplying r with table scale and truncate to integer */
1927 rt = _mm_mul_ps(r33,vftabscale);
1928 vfitab = _mm_cvttps_epi32(rt);
1929 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1930 vfitab = _mm_slli_epi32(vfitab,2);
1932 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1933 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1934 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1935 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1936 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1937 _MM_TRANSPOSE4_PS(Y,F,G,H);
1938 Heps = _mm_mul_ps(vfeps,H);
1939 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1940 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1941 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1945 /* Calculate temporary vectorial force */
1946 tx = _mm_mul_ps(fscal,dx33);
1947 ty = _mm_mul_ps(fscal,dy33);
1948 tz = _mm_mul_ps(fscal,dz33);
1950 /* Update vectorial force */
1951 fix3 = _mm_add_ps(fix3,tx);
1952 fiy3 = _mm_add_ps(fiy3,ty);
1953 fiz3 = _mm_add_ps(fiz3,tz);
1955 fjx3 = _mm_add_ps(fjx3,tx);
1956 fjy3 = _mm_add_ps(fjy3,ty);
1957 fjz3 = _mm_add_ps(fjz3,tz);
1959 fjptrA = f+j_coord_offsetA;
1960 fjptrB = f+j_coord_offsetB;
1961 fjptrC = f+j_coord_offsetC;
1962 fjptrD = f+j_coord_offsetD;
1964 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1965 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1966 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1968 /* Inner loop uses 381 flops */
1971 if(jidx<j_index_end)
1974 /* Get j neighbor index, and coordinate index */
1975 jnrlistA = jjnr[jidx];
1976 jnrlistB = jjnr[jidx+1];
1977 jnrlistC = jjnr[jidx+2];
1978 jnrlistD = jjnr[jidx+3];
1979 /* Sign of each element will be negative for non-real atoms.
1980 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1981 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1983 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1984 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1985 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1986 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1987 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1988 j_coord_offsetA = DIM*jnrA;
1989 j_coord_offsetB = DIM*jnrB;
1990 j_coord_offsetC = DIM*jnrC;
1991 j_coord_offsetD = DIM*jnrD;
1993 /* load j atom coordinates */
1994 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1995 x+j_coord_offsetC,x+j_coord_offsetD,
1996 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1997 &jy2,&jz2,&jx3,&jy3,&jz3);
1999 /* Calculate displacement vector */
2000 dx00 = _mm_sub_ps(ix0,jx0);
2001 dy00 = _mm_sub_ps(iy0,jy0);
2002 dz00 = _mm_sub_ps(iz0,jz0);
2003 dx11 = _mm_sub_ps(ix1,jx1);
2004 dy11 = _mm_sub_ps(iy1,jy1);
2005 dz11 = _mm_sub_ps(iz1,jz1);
2006 dx12 = _mm_sub_ps(ix1,jx2);
2007 dy12 = _mm_sub_ps(iy1,jy2);
2008 dz12 = _mm_sub_ps(iz1,jz2);
2009 dx13 = _mm_sub_ps(ix1,jx3);
2010 dy13 = _mm_sub_ps(iy1,jy3);
2011 dz13 = _mm_sub_ps(iz1,jz3);
2012 dx21 = _mm_sub_ps(ix2,jx1);
2013 dy21 = _mm_sub_ps(iy2,jy1);
2014 dz21 = _mm_sub_ps(iz2,jz1);
2015 dx22 = _mm_sub_ps(ix2,jx2);
2016 dy22 = _mm_sub_ps(iy2,jy2);
2017 dz22 = _mm_sub_ps(iz2,jz2);
2018 dx23 = _mm_sub_ps(ix2,jx3);
2019 dy23 = _mm_sub_ps(iy2,jy3);
2020 dz23 = _mm_sub_ps(iz2,jz3);
2021 dx31 = _mm_sub_ps(ix3,jx1);
2022 dy31 = _mm_sub_ps(iy3,jy1);
2023 dz31 = _mm_sub_ps(iz3,jz1);
2024 dx32 = _mm_sub_ps(ix3,jx2);
2025 dy32 = _mm_sub_ps(iy3,jy2);
2026 dz32 = _mm_sub_ps(iz3,jz2);
2027 dx33 = _mm_sub_ps(ix3,jx3);
2028 dy33 = _mm_sub_ps(iy3,jy3);
2029 dz33 = _mm_sub_ps(iz3,jz3);
2031 /* Calculate squared distance and things based on it */
2032 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2033 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2034 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2035 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2036 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2037 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2038 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2039 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2040 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2041 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2043 rinv11 = gmx_mm_invsqrt_ps(rsq11);
2044 rinv12 = gmx_mm_invsqrt_ps(rsq12);
2045 rinv13 = gmx_mm_invsqrt_ps(rsq13);
2046 rinv21 = gmx_mm_invsqrt_ps(rsq21);
2047 rinv22 = gmx_mm_invsqrt_ps(rsq22);
2048 rinv23 = gmx_mm_invsqrt_ps(rsq23);
2049 rinv31 = gmx_mm_invsqrt_ps(rsq31);
2050 rinv32 = gmx_mm_invsqrt_ps(rsq32);
2051 rinv33 = gmx_mm_invsqrt_ps(rsq33);
2053 rinvsq00 = gmx_mm_inv_ps(rsq00);
2055 fjx0 = _mm_setzero_ps();
2056 fjy0 = _mm_setzero_ps();
2057 fjz0 = _mm_setzero_ps();
2058 fjx1 = _mm_setzero_ps();
2059 fjy1 = _mm_setzero_ps();
2060 fjz1 = _mm_setzero_ps();
2061 fjx2 = _mm_setzero_ps();
2062 fjy2 = _mm_setzero_ps();
2063 fjz2 = _mm_setzero_ps();
2064 fjx3 = _mm_setzero_ps();
2065 fjy3 = _mm_setzero_ps();
2066 fjz3 = _mm_setzero_ps();
2068 /**************************
2069 * CALCULATE INTERACTIONS *
2070 **************************/
2072 /* LENNARD-JONES DISPERSION/REPULSION */
2074 rinvsix = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2075 fvdw = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2079 fscal = _mm_andnot_ps(dummy_mask,fscal);
2081 /* Calculate temporary vectorial force */
2082 tx = _mm_mul_ps(fscal,dx00);
2083 ty = _mm_mul_ps(fscal,dy00);
2084 tz = _mm_mul_ps(fscal,dz00);
2086 /* Update vectorial force */
2087 fix0 = _mm_add_ps(fix0,tx);
2088 fiy0 = _mm_add_ps(fiy0,ty);
2089 fiz0 = _mm_add_ps(fiz0,tz);
2091 fjx0 = _mm_add_ps(fjx0,tx);
2092 fjy0 = _mm_add_ps(fjy0,ty);
2093 fjz0 = _mm_add_ps(fjz0,tz);
2095 /**************************
2096 * CALCULATE INTERACTIONS *
2097 **************************/
2099 r11 = _mm_mul_ps(rsq11,rinv11);
2100 r11 = _mm_andnot_ps(dummy_mask,r11);
2102 /* Calculate table index by multiplying r with table scale and truncate to integer */
2103 rt = _mm_mul_ps(r11,vftabscale);
2104 vfitab = _mm_cvttps_epi32(rt);
2105 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2106 vfitab = _mm_slli_epi32(vfitab,2);
2108 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2109 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2110 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2111 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2112 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2113 _MM_TRANSPOSE4_PS(Y,F,G,H);
2114 Heps = _mm_mul_ps(vfeps,H);
2115 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2116 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2117 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2121 fscal = _mm_andnot_ps(dummy_mask,fscal);
2123 /* Calculate temporary vectorial force */
2124 tx = _mm_mul_ps(fscal,dx11);
2125 ty = _mm_mul_ps(fscal,dy11);
2126 tz = _mm_mul_ps(fscal,dz11);
2128 /* Update vectorial force */
2129 fix1 = _mm_add_ps(fix1,tx);
2130 fiy1 = _mm_add_ps(fiy1,ty);
2131 fiz1 = _mm_add_ps(fiz1,tz);
2133 fjx1 = _mm_add_ps(fjx1,tx);
2134 fjy1 = _mm_add_ps(fjy1,ty);
2135 fjz1 = _mm_add_ps(fjz1,tz);
2137 /**************************
2138 * CALCULATE INTERACTIONS *
2139 **************************/
2141 r12 = _mm_mul_ps(rsq12,rinv12);
2142 r12 = _mm_andnot_ps(dummy_mask,r12);
2144 /* Calculate table index by multiplying r with table scale and truncate to integer */
2145 rt = _mm_mul_ps(r12,vftabscale);
2146 vfitab = _mm_cvttps_epi32(rt);
2147 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2148 vfitab = _mm_slli_epi32(vfitab,2);
2150 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2151 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2152 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2153 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2154 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2155 _MM_TRANSPOSE4_PS(Y,F,G,H);
2156 Heps = _mm_mul_ps(vfeps,H);
2157 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2158 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2159 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2163 fscal = _mm_andnot_ps(dummy_mask,fscal);
2165 /* Calculate temporary vectorial force */
2166 tx = _mm_mul_ps(fscal,dx12);
2167 ty = _mm_mul_ps(fscal,dy12);
2168 tz = _mm_mul_ps(fscal,dz12);
2170 /* Update vectorial force */
2171 fix1 = _mm_add_ps(fix1,tx);
2172 fiy1 = _mm_add_ps(fiy1,ty);
2173 fiz1 = _mm_add_ps(fiz1,tz);
2175 fjx2 = _mm_add_ps(fjx2,tx);
2176 fjy2 = _mm_add_ps(fjy2,ty);
2177 fjz2 = _mm_add_ps(fjz2,tz);
2179 /**************************
2180 * CALCULATE INTERACTIONS *
2181 **************************/
2183 r13 = _mm_mul_ps(rsq13,rinv13);
2184 r13 = _mm_andnot_ps(dummy_mask,r13);
2186 /* Calculate table index by multiplying r with table scale and truncate to integer */
2187 rt = _mm_mul_ps(r13,vftabscale);
2188 vfitab = _mm_cvttps_epi32(rt);
2189 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2190 vfitab = _mm_slli_epi32(vfitab,2);
2192 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2193 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2194 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2195 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2196 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2197 _MM_TRANSPOSE4_PS(Y,F,G,H);
2198 Heps = _mm_mul_ps(vfeps,H);
2199 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2200 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2201 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2205 fscal = _mm_andnot_ps(dummy_mask,fscal);
2207 /* Calculate temporary vectorial force */
2208 tx = _mm_mul_ps(fscal,dx13);
2209 ty = _mm_mul_ps(fscal,dy13);
2210 tz = _mm_mul_ps(fscal,dz13);
2212 /* Update vectorial force */
2213 fix1 = _mm_add_ps(fix1,tx);
2214 fiy1 = _mm_add_ps(fiy1,ty);
2215 fiz1 = _mm_add_ps(fiz1,tz);
2217 fjx3 = _mm_add_ps(fjx3,tx);
2218 fjy3 = _mm_add_ps(fjy3,ty);
2219 fjz3 = _mm_add_ps(fjz3,tz);
2221 /**************************
2222 * CALCULATE INTERACTIONS *
2223 **************************/
2225 r21 = _mm_mul_ps(rsq21,rinv21);
2226 r21 = _mm_andnot_ps(dummy_mask,r21);
2228 /* Calculate table index by multiplying r with table scale and truncate to integer */
2229 rt = _mm_mul_ps(r21,vftabscale);
2230 vfitab = _mm_cvttps_epi32(rt);
2231 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2232 vfitab = _mm_slli_epi32(vfitab,2);
2234 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2235 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2236 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2237 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2238 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2239 _MM_TRANSPOSE4_PS(Y,F,G,H);
2240 Heps = _mm_mul_ps(vfeps,H);
2241 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2242 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2243 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2247 fscal = _mm_andnot_ps(dummy_mask,fscal);
2249 /* Calculate temporary vectorial force */
2250 tx = _mm_mul_ps(fscal,dx21);
2251 ty = _mm_mul_ps(fscal,dy21);
2252 tz = _mm_mul_ps(fscal,dz21);
2254 /* Update vectorial force */
2255 fix2 = _mm_add_ps(fix2,tx);
2256 fiy2 = _mm_add_ps(fiy2,ty);
2257 fiz2 = _mm_add_ps(fiz2,tz);
2259 fjx1 = _mm_add_ps(fjx1,tx);
2260 fjy1 = _mm_add_ps(fjy1,ty);
2261 fjz1 = _mm_add_ps(fjz1,tz);
2263 /**************************
2264 * CALCULATE INTERACTIONS *
2265 **************************/
2267 r22 = _mm_mul_ps(rsq22,rinv22);
2268 r22 = _mm_andnot_ps(dummy_mask,r22);
2270 /* Calculate table index by multiplying r with table scale and truncate to integer */
2271 rt = _mm_mul_ps(r22,vftabscale);
2272 vfitab = _mm_cvttps_epi32(rt);
2273 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2274 vfitab = _mm_slli_epi32(vfitab,2);
2276 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2277 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2278 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2279 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2280 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2281 _MM_TRANSPOSE4_PS(Y,F,G,H);
2282 Heps = _mm_mul_ps(vfeps,H);
2283 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2284 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2285 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2289 fscal = _mm_andnot_ps(dummy_mask,fscal);
2291 /* Calculate temporary vectorial force */
2292 tx = _mm_mul_ps(fscal,dx22);
2293 ty = _mm_mul_ps(fscal,dy22);
2294 tz = _mm_mul_ps(fscal,dz22);
2296 /* Update vectorial force */
2297 fix2 = _mm_add_ps(fix2,tx);
2298 fiy2 = _mm_add_ps(fiy2,ty);
2299 fiz2 = _mm_add_ps(fiz2,tz);
2301 fjx2 = _mm_add_ps(fjx2,tx);
2302 fjy2 = _mm_add_ps(fjy2,ty);
2303 fjz2 = _mm_add_ps(fjz2,tz);
2305 /**************************
2306 * CALCULATE INTERACTIONS *
2307 **************************/
2309 r23 = _mm_mul_ps(rsq23,rinv23);
2310 r23 = _mm_andnot_ps(dummy_mask,r23);
2312 /* Calculate table index by multiplying r with table scale and truncate to integer */
2313 rt = _mm_mul_ps(r23,vftabscale);
2314 vfitab = _mm_cvttps_epi32(rt);
2315 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2316 vfitab = _mm_slli_epi32(vfitab,2);
2318 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2319 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2320 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2321 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2322 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2323 _MM_TRANSPOSE4_PS(Y,F,G,H);
2324 Heps = _mm_mul_ps(vfeps,H);
2325 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2326 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2327 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2331 fscal = _mm_andnot_ps(dummy_mask,fscal);
2333 /* Calculate temporary vectorial force */
2334 tx = _mm_mul_ps(fscal,dx23);
2335 ty = _mm_mul_ps(fscal,dy23);
2336 tz = _mm_mul_ps(fscal,dz23);
2338 /* Update vectorial force */
2339 fix2 = _mm_add_ps(fix2,tx);
2340 fiy2 = _mm_add_ps(fiy2,ty);
2341 fiz2 = _mm_add_ps(fiz2,tz);
2343 fjx3 = _mm_add_ps(fjx3,tx);
2344 fjy3 = _mm_add_ps(fjy3,ty);
2345 fjz3 = _mm_add_ps(fjz3,tz);
2347 /**************************
2348 * CALCULATE INTERACTIONS *
2349 **************************/
2351 r31 = _mm_mul_ps(rsq31,rinv31);
2352 r31 = _mm_andnot_ps(dummy_mask,r31);
2354 /* Calculate table index by multiplying r with table scale and truncate to integer */
2355 rt = _mm_mul_ps(r31,vftabscale);
2356 vfitab = _mm_cvttps_epi32(rt);
2357 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2358 vfitab = _mm_slli_epi32(vfitab,2);
2360 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2361 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2362 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2363 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2364 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2365 _MM_TRANSPOSE4_PS(Y,F,G,H);
2366 Heps = _mm_mul_ps(vfeps,H);
2367 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2368 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2369 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2373 fscal = _mm_andnot_ps(dummy_mask,fscal);
2375 /* Calculate temporary vectorial force */
2376 tx = _mm_mul_ps(fscal,dx31);
2377 ty = _mm_mul_ps(fscal,dy31);
2378 tz = _mm_mul_ps(fscal,dz31);
2380 /* Update vectorial force */
2381 fix3 = _mm_add_ps(fix3,tx);
2382 fiy3 = _mm_add_ps(fiy3,ty);
2383 fiz3 = _mm_add_ps(fiz3,tz);
2385 fjx1 = _mm_add_ps(fjx1,tx);
2386 fjy1 = _mm_add_ps(fjy1,ty);
2387 fjz1 = _mm_add_ps(fjz1,tz);
2389 /**************************
2390 * CALCULATE INTERACTIONS *
2391 **************************/
2393 r32 = _mm_mul_ps(rsq32,rinv32);
2394 r32 = _mm_andnot_ps(dummy_mask,r32);
2396 /* Calculate table index by multiplying r with table scale and truncate to integer */
2397 rt = _mm_mul_ps(r32,vftabscale);
2398 vfitab = _mm_cvttps_epi32(rt);
2399 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2400 vfitab = _mm_slli_epi32(vfitab,2);
2402 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2403 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2404 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2405 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2406 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2407 _MM_TRANSPOSE4_PS(Y,F,G,H);
2408 Heps = _mm_mul_ps(vfeps,H);
2409 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2410 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2411 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2415 fscal = _mm_andnot_ps(dummy_mask,fscal);
2417 /* Calculate temporary vectorial force */
2418 tx = _mm_mul_ps(fscal,dx32);
2419 ty = _mm_mul_ps(fscal,dy32);
2420 tz = _mm_mul_ps(fscal,dz32);
2422 /* Update vectorial force */
2423 fix3 = _mm_add_ps(fix3,tx);
2424 fiy3 = _mm_add_ps(fiy3,ty);
2425 fiz3 = _mm_add_ps(fiz3,tz);
2427 fjx2 = _mm_add_ps(fjx2,tx);
2428 fjy2 = _mm_add_ps(fjy2,ty);
2429 fjz2 = _mm_add_ps(fjz2,tz);
2431 /**************************
2432 * CALCULATE INTERACTIONS *
2433 **************************/
2435 r33 = _mm_mul_ps(rsq33,rinv33);
2436 r33 = _mm_andnot_ps(dummy_mask,r33);
2438 /* Calculate table index by multiplying r with table scale and truncate to integer */
2439 rt = _mm_mul_ps(r33,vftabscale);
2440 vfitab = _mm_cvttps_epi32(rt);
2441 vfeps = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2442 vfitab = _mm_slli_epi32(vfitab,2);
2444 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2445 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2446 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2447 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2448 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2449 _MM_TRANSPOSE4_PS(Y,F,G,H);
2450 Heps = _mm_mul_ps(vfeps,H);
2451 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2452 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2453 felec = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2457 fscal = _mm_andnot_ps(dummy_mask,fscal);
2459 /* Calculate temporary vectorial force */
2460 tx = _mm_mul_ps(fscal,dx33);
2461 ty = _mm_mul_ps(fscal,dy33);
2462 tz = _mm_mul_ps(fscal,dz33);
2464 /* Update vectorial force */
2465 fix3 = _mm_add_ps(fix3,tx);
2466 fiy3 = _mm_add_ps(fiy3,ty);
2467 fiz3 = _mm_add_ps(fiz3,tz);
2469 fjx3 = _mm_add_ps(fjx3,tx);
2470 fjy3 = _mm_add_ps(fjy3,ty);
2471 fjz3 = _mm_add_ps(fjz3,tz);
2473 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2474 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2475 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2476 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2478 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2479 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2480 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2482 /* Inner loop uses 390 flops */
2485 /* End of innermost loop */
2487 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2488 f+i_coord_offset,fshift+i_shift_offset);
2490 /* Increment number of inner iterations */
2491 inneriter += j_index_end - j_index_start;
2493 /* Outer loop uses 24 flops */
2496 /* Increment number of outer iterations */
2499 /* Update outer/inner flops */
2501 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390);