2 * Note: this file was generated by the Gromacs sse2_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
/*
 * Nonbonded kernel entry point. Per the generator banner directly above:
 * Coulomb electrostatics, cubic-spline-table Van der Waals, Water4-Water4
 * geometry, computing both potential and force.
 *
 * Inputs read in the setup lines below:
 *   nlist       - jindex, shift and iinr arrays
 *   fr          - shift_vec, fshift, epsfac and ntype
 *   mdatoms     - chargeA and typeA
 *   kernel_data - table_vdw (data and scale)
 *
 * NOTE(review): several names are used in this routine without a visible
 * assignment (e.g. vdwparam, nri, x, f, jjnr); presumably those lines are
 * missing from this copy of the file - confirm against the generator output.
 */
44 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_sse2_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
63 real shX,shY,shZ,rcutoff_scalar;
64 real *shiftvec,*fshift,*x,*f;
65 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
95 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
98 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
99 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
101 __m128i ifour = _mm_set1_epi32(4);
102 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
104 __m128 dummy_mask,cutoff_mask;
105 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106 __m128 one = _mm_set1_ps(1.0);
107 __m128 two = _mm_set1_ps(2.0);
113 jindex = nlist->jindex;
115 shiftidx = nlist->shift;
117 shiftvec = fr->shift_vec[0];
118 fshift = fr->fshift[0];
119 facel = _mm_set1_ps(fr->epsfac);
120 charge = mdatoms->chargeA;
121 nvdwtype = fr->ntype;
123 vdwtype = mdatoms->typeA;
125 vftab = kernel_data->table_vdw->data;
126 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
128 /* Setup water-specific parameters */
129 inr = nlist->iinr[0];
/* Charges of i sites 1..3 are broadcast to all lanes and pre-multiplied by facel (fr->epsfac) */
130 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
131 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
132 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
133 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
/* The j-side charges are read from the same first list entry (inr), without the facel factor */
135 jq1 = _mm_set1_ps(charge[inr+1]);
136 jq2 = _mm_set1_ps(charge[inr+2]);
137 jq3 = _mm_set1_ps(charge[inr+3]);
/* c6 and c12 for the site-0 pair are adjacent entries of vdwparam.
 * NOTE(review): no assignment to vdwparam is visible in this copy - confirm. */
138 vdwjidx0A = 2*vdwtype[inr+0];
139 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
140 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
/* Charge products for the nine pairs among sites 1..3; in the visible code
 * site 0 only takes part in the tabulated Van der Waals pair 0-0. */
141 qq11 = _mm_mul_ps(iq1,jq1);
142 qq12 = _mm_mul_ps(iq1,jq2);
143 qq13 = _mm_mul_ps(iq1,jq3);
144 qq21 = _mm_mul_ps(iq2,jq1);
145 qq22 = _mm_mul_ps(iq2,jq2);
146 qq23 = _mm_mul_ps(iq2,jq3);
147 qq31 = _mm_mul_ps(iq3,jq1);
148 qq32 = _mm_mul_ps(iq3,jq2);
149 qq33 = _mm_mul_ps(iq3,jq3);
151 /* Avoid stupid compiler warnings */
152 jnrA = jnrB = jnrC = jnrD = 0;
/* Per-list setup: load the shift vector and j range, then prepare the four
 * i sites (shifted coordinates, zeroed force and potential accumulators). */
161 /* Start outer loop over neighborlists */
162 for(iidx=0; iidx<nri; iidx++)
164 /* Load shift vector for this list */
165 i_shift_offset = DIM*shiftidx[iidx];
166 shX = shiftvec[i_shift_offset+XX];
167 shY = shiftvec[i_shift_offset+YY];
168 shZ = shiftvec[i_shift_offset+ZZ];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
/* NOTE(review): the only visible assignment to inr is nlist->iinr[0] during
 * setup; a per-list update is presumably missing from this copy - confirm. */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
/* Each shifted coordinate is broadcast to all four SIMD lanes */
179 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
180 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
181 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
182 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
183 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
184 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
185 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
186 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
187 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
188 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
189 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
190 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
/* Clear the force accumulators of the four i sites */
192 fix0 = _mm_setzero_ps();
193 fiy0 = _mm_setzero_ps();
194 fiz0 = _mm_setzero_ps();
195 fix1 = _mm_setzero_ps();
196 fiy1 = _mm_setzero_ps();
197 fiz1 = _mm_setzero_ps();
198 fix2 = _mm_setzero_ps();
199 fiy2 = _mm_setzero_ps();
200 fiz2 = _mm_setzero_ps();
201 fix3 = _mm_setzero_ps();
202 fiy3 = _mm_setzero_ps();
203 fiz3 = _mm_setzero_ps();
205 /* Reset potential sums */
206 velecsum = _mm_setzero_ps();
207 vvdwsum = _mm_setzero_ps();
/* Unmasked pass: four j entries are handled per iteration, and the loop
 * stops as soon as the fourth entry of a group of four is negative. */
209 /* Start inner kernel loop */
210 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
213 /* Get j neighbor index, and coordinate index */
/* NOTE(review): no load of jnrA..jnrD from jjnr is visible here; presumably
 * those lines are missing from this copy - confirm. */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
224 /* load j atom coordinates */
225 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226 x+j_coord_offsetC,x+j_coord_offsetD,
227 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
228 &jy2,&jz2,&jx3,&jy3,&jz3);
230 /* Calculate displacement vector */
/* Ten site pairs are formed: 0-0 plus every combination of i sites 1..3 with j sites 1..3 */
231 dx00 = _mm_sub_ps(ix0,jx0);
232 dy00 = _mm_sub_ps(iy0,jy0);
233 dz00 = _mm_sub_ps(iz0,jz0);
234 dx11 = _mm_sub_ps(ix1,jx1);
235 dy11 = _mm_sub_ps(iy1,jy1);
236 dz11 = _mm_sub_ps(iz1,jz1);
237 dx12 = _mm_sub_ps(ix1,jx2);
238 dy12 = _mm_sub_ps(iy1,jy2);
239 dz12 = _mm_sub_ps(iz1,jz2);
240 dx13 = _mm_sub_ps(ix1,jx3);
241 dy13 = _mm_sub_ps(iy1,jy3);
242 dz13 = _mm_sub_ps(iz1,jz3);
243 dx21 = _mm_sub_ps(ix2,jx1);
244 dy21 = _mm_sub_ps(iy2,jy1);
245 dz21 = _mm_sub_ps(iz2,jz1);
246 dx22 = _mm_sub_ps(ix2,jx2);
247 dy22 = _mm_sub_ps(iy2,jy2);
248 dz22 = _mm_sub_ps(iz2,jz2);
249 dx23 = _mm_sub_ps(ix2,jx3);
250 dy23 = _mm_sub_ps(iy2,jy3);
251 dz23 = _mm_sub_ps(iz2,jz3);
252 dx31 = _mm_sub_ps(ix3,jx1);
253 dy31 = _mm_sub_ps(iy3,jy1);
254 dz31 = _mm_sub_ps(iz3,jz1);
255 dx32 = _mm_sub_ps(ix3,jx2);
256 dy32 = _mm_sub_ps(iy3,jy2);
257 dz32 = _mm_sub_ps(iz3,jz2);
258 dx33 = _mm_sub_ps(ix3,jx3);
259 dy33 = _mm_sub_ps(iy3,jy3);
260 dz33 = _mm_sub_ps(iz3,jz3);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
264 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
265 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
266 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
267 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
268 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
269 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
270 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
271 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
272 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
274 rinv00 = gmx_mm_invsqrt_ps(rsq00);
275 rinv11 = gmx_mm_invsqrt_ps(rsq11);
276 rinv12 = gmx_mm_invsqrt_ps(rsq12);
277 rinv13 = gmx_mm_invsqrt_ps(rsq13);
278 rinv21 = gmx_mm_invsqrt_ps(rsq21);
279 rinv22 = gmx_mm_invsqrt_ps(rsq22);
280 rinv23 = gmx_mm_invsqrt_ps(rsq23);
281 rinv31 = gmx_mm_invsqrt_ps(rsq31);
282 rinv32 = gmx_mm_invsqrt_ps(rsq32);
283 rinv33 = gmx_mm_invsqrt_ps(rsq33);
/* rinv squared is formed only for the nine pairs 11..33; pair 00 does not use it */
285 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
286 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
287 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
288 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
289 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
290 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
291 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
292 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
293 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
/* j-site force accumulators start from zero on every pass */
295 fjx0 = _mm_setzero_ps();
296 fjy0 = _mm_setzero_ps();
297 fjz0 = _mm_setzero_ps();
298 fjx1 = _mm_setzero_ps();
299 fjy1 = _mm_setzero_ps();
300 fjz1 = _mm_setzero_ps();
301 fjx2 = _mm_setzero_ps();
302 fjy2 = _mm_setzero_ps();
303 fjz2 = _mm_setzero_ps();
304 fjx3 = _mm_setzero_ps();
305 fjy3 = _mm_setzero_ps();
306 fjz3 = _mm_setzero_ps();
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
/* Site pair i0-j0: tabulated Van der Waals only */
312 r00 = _mm_mul_ps(rsq00,rinv00);
314 /* Calculate table index by multiplying r with table scale and truncate to integer */
315 rt = _mm_mul_ps(r00,vftabscale);
316 vfitab = _mm_cvttps_epi32(rt);
317 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
/* Shifting left by 3 multiplies the index by 8; the result is used below as a float offset into vftab */
318 vfitab = _mm_slli_epi32(vfitab,3);
320 /* CUBIC SPLINE TABLE DISPERSION */
/* One 4-float load per lane, then a transpose so that Y, F, G and H each
 * hold one of the four loaded values for all four lanes. */
321 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
322 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
323 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
324 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
325 _MM_TRANSPOSE4_PS(Y,F,G,H);
/* VV = Y + eps*(F + eps*(G + eps*H)); FF = Fp + eps*(G + 2*eps*H), with eps = vfeps */
326 Heps = _mm_mul_ps(vfeps,H);
327 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
328 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
329 vvdw6 = _mm_mul_ps(c6_00,VV);
330 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
331 fvdw6 = _mm_mul_ps(c6_00,FF);
333 /* CUBIC SPLINE TABLE REPULSION */
/* The repulsion values are read 4 floats after the dispersion values of the same table point */
334 vfitab = _mm_add_epi32(vfitab,ifour);
335 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
336 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
337 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
338 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
339 _MM_TRANSPOSE4_PS(Y,F,G,H);
340 Heps = _mm_mul_ps(vfeps,H);
341 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
342 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
343 vvdw12 = _mm_mul_ps(c12_00,VV);
344 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
345 fvdw12 = _mm_mul_ps(c12_00,FF);
346 vvdw = _mm_add_ps(vvdw12,vvdw6);
/* XOR with signbit negates the summed table derivative, which is scaled by vftabscale*rinv00 */
347 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
/* NOTE(review): fscal is used here and for every pair below, but no
 * assignment to it is visible in this copy (fvdw and felec are computed and
 * not referenced again) - confirm against the generator output. */
354 /* Calculate temporary vectorial force */
355 tx = _mm_mul_ps(fscal,dx00);
356 ty = _mm_mul_ps(fscal,dy00);
357 tz = _mm_mul_ps(fscal,dz00);
359 /* Update vectorial force */
360 fix0 = _mm_add_ps(fix0,tx);
361 fiy0 = _mm_add_ps(fiy0,ty);
362 fiz0 = _mm_add_ps(fiz0,tz);
/* The same tx/ty/tz is accumulated for the j site; the j sums are handed to
 * the decrement helper at the end of this pass. */
364 fjx0 = _mm_add_ps(fjx0,tx);
365 fjy0 = _mm_add_ps(fjy0,ty);
366 fjz0 = _mm_add_ps(fjz0,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
/* Site pair i1-j1: velec = qq11*rinv11, felec = velec*rinvsq11 */
372 /* COULOMB ELECTROSTATICS */
373 velec = _mm_mul_ps(qq11,rinv11);
374 felec = _mm_mul_ps(velec,rinvsq11);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velecsum = _mm_add_ps(velecsum,velec);
381 /* Calculate temporary vectorial force */
382 tx = _mm_mul_ps(fscal,dx11);
383 ty = _mm_mul_ps(fscal,dy11);
384 tz = _mm_mul_ps(fscal,dz11);
386 /* Update vectorial force */
387 fix1 = _mm_add_ps(fix1,tx);
388 fiy1 = _mm_add_ps(fiy1,ty);
389 fiz1 = _mm_add_ps(fiz1,tz);
391 fjx1 = _mm_add_ps(fjx1,tx);
392 fjy1 = _mm_add_ps(fjy1,ty);
393 fjz1 = _mm_add_ps(fjz1,tz);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
/* Site pair i1-j2 */
399 /* COULOMB ELECTROSTATICS */
400 velec = _mm_mul_ps(qq12,rinv12);
401 felec = _mm_mul_ps(velec,rinvsq12);
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum = _mm_add_ps(velecsum,velec);
408 /* Calculate temporary vectorial force */
409 tx = _mm_mul_ps(fscal,dx12);
410 ty = _mm_mul_ps(fscal,dy12);
411 tz = _mm_mul_ps(fscal,dz12);
413 /* Update vectorial force */
414 fix1 = _mm_add_ps(fix1,tx);
415 fiy1 = _mm_add_ps(fiy1,ty);
416 fiz1 = _mm_add_ps(fiz1,tz);
418 fjx2 = _mm_add_ps(fjx2,tx);
419 fjy2 = _mm_add_ps(fjy2,ty);
420 fjz2 = _mm_add_ps(fjz2,tz);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
/* Site pair i1-j3 */
426 /* COULOMB ELECTROSTATICS */
427 velec = _mm_mul_ps(qq13,rinv13);
428 felec = _mm_mul_ps(velec,rinvsq13);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm_add_ps(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm_mul_ps(fscal,dx13);
437 ty = _mm_mul_ps(fscal,dy13);
438 tz = _mm_mul_ps(fscal,dz13);
440 /* Update vectorial force */
441 fix1 = _mm_add_ps(fix1,tx);
442 fiy1 = _mm_add_ps(fiy1,ty);
443 fiz1 = _mm_add_ps(fiz1,tz);
445 fjx3 = _mm_add_ps(fjx3,tx);
446 fjy3 = _mm_add_ps(fjy3,ty);
447 fjz3 = _mm_add_ps(fjz3,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
/* Site pair i2-j1 */
453 /* COULOMB ELECTROSTATICS */
454 velec = _mm_mul_ps(qq21,rinv21);
455 felec = _mm_mul_ps(velec,rinvsq21);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velecsum = _mm_add_ps(velecsum,velec);
462 /* Calculate temporary vectorial force */
463 tx = _mm_mul_ps(fscal,dx21);
464 ty = _mm_mul_ps(fscal,dy21);
465 tz = _mm_mul_ps(fscal,dz21);
467 /* Update vectorial force */
468 fix2 = _mm_add_ps(fix2,tx);
469 fiy2 = _mm_add_ps(fiy2,ty);
470 fiz2 = _mm_add_ps(fiz2,tz);
472 fjx1 = _mm_add_ps(fjx1,tx);
473 fjy1 = _mm_add_ps(fjy1,ty);
474 fjz1 = _mm_add_ps(fjz1,tz);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
/* Site pair i2-j2 */
480 /* COULOMB ELECTROSTATICS */
481 velec = _mm_mul_ps(qq22,rinv22);
482 felec = _mm_mul_ps(velec,rinvsq22);
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm_add_ps(velecsum,velec);
489 /* Calculate temporary vectorial force */
490 tx = _mm_mul_ps(fscal,dx22);
491 ty = _mm_mul_ps(fscal,dy22);
492 tz = _mm_mul_ps(fscal,dz22);
494 /* Update vectorial force */
495 fix2 = _mm_add_ps(fix2,tx);
496 fiy2 = _mm_add_ps(fiy2,ty);
497 fiz2 = _mm_add_ps(fiz2,tz);
499 fjx2 = _mm_add_ps(fjx2,tx);
500 fjy2 = _mm_add_ps(fjy2,ty);
501 fjz2 = _mm_add_ps(fjz2,tz);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
/* Site pair i2-j3 */
507 /* COULOMB ELECTROSTATICS */
508 velec = _mm_mul_ps(qq23,rinv23);
509 felec = _mm_mul_ps(velec,rinvsq23);
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velecsum = _mm_add_ps(velecsum,velec);
516 /* Calculate temporary vectorial force */
517 tx = _mm_mul_ps(fscal,dx23);
518 ty = _mm_mul_ps(fscal,dy23);
519 tz = _mm_mul_ps(fscal,dz23);
521 /* Update vectorial force */
522 fix2 = _mm_add_ps(fix2,tx);
523 fiy2 = _mm_add_ps(fiy2,ty);
524 fiz2 = _mm_add_ps(fiz2,tz);
526 fjx3 = _mm_add_ps(fjx3,tx);
527 fjy3 = _mm_add_ps(fjy3,ty);
528 fjz3 = _mm_add_ps(fjz3,tz);
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
/* Site pair i3-j1 */
534 /* COULOMB ELECTROSTATICS */
535 velec = _mm_mul_ps(qq31,rinv31);
536 felec = _mm_mul_ps(velec,rinvsq31);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velecsum = _mm_add_ps(velecsum,velec);
543 /* Calculate temporary vectorial force */
544 tx = _mm_mul_ps(fscal,dx31);
545 ty = _mm_mul_ps(fscal,dy31);
546 tz = _mm_mul_ps(fscal,dz31);
548 /* Update vectorial force */
549 fix3 = _mm_add_ps(fix3,tx);
550 fiy3 = _mm_add_ps(fiy3,ty);
551 fiz3 = _mm_add_ps(fiz3,tz);
553 fjx1 = _mm_add_ps(fjx1,tx);
554 fjy1 = _mm_add_ps(fjy1,ty);
555 fjz1 = _mm_add_ps(fjz1,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
/* Site pair i3-j2 */
561 /* COULOMB ELECTROSTATICS */
562 velec = _mm_mul_ps(qq32,rinv32);
563 felec = _mm_mul_ps(velec,rinvsq32);
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velecsum = _mm_add_ps(velecsum,velec);
570 /* Calculate temporary vectorial force */
571 tx = _mm_mul_ps(fscal,dx32);
572 ty = _mm_mul_ps(fscal,dy32);
573 tz = _mm_mul_ps(fscal,dz32);
575 /* Update vectorial force */
576 fix3 = _mm_add_ps(fix3,tx);
577 fiy3 = _mm_add_ps(fiy3,ty);
578 fiz3 = _mm_add_ps(fiz3,tz);
580 fjx2 = _mm_add_ps(fjx2,tx);
581 fjy2 = _mm_add_ps(fjy2,ty);
582 fjz2 = _mm_add_ps(fjz2,tz);
584 /**************************
585 * CALCULATE INTERACTIONS *
586 **************************/
/* Site pair i3-j3 */
588 /* COULOMB ELECTROSTATICS */
589 velec = _mm_mul_ps(qq33,rinv33);
590 felec = _mm_mul_ps(velec,rinvsq33);
592 /* Update potential sum for this i atom from the interaction with this j atom. */
593 velecsum = _mm_add_ps(velecsum,velec);
597 /* Calculate temporary vectorial force */
598 tx = _mm_mul_ps(fscal,dx33);
599 ty = _mm_mul_ps(fscal,dy33);
600 tz = _mm_mul_ps(fscal,dz33);
602 /* Update vectorial force */
603 fix3 = _mm_add_ps(fix3,tx);
604 fiy3 = _mm_add_ps(fiy3,ty);
605 fiz3 = _mm_add_ps(fiz3,tz);
607 fjx3 = _mm_add_ps(fjx3,tx);
608 fjy3 = _mm_add_ps(fjy3,ty);
609 fjz3 = _mm_add_ps(fjz3,tz);
/* Pass the accumulated j-site sums for all four j entries to the decrement helper, targeting f */
611 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
612 f+j_coord_offsetC,f+j_coord_offsetD,
613 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
614 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
616 /* Inner loop uses 311 flops */
/* Masked pass: the same per-pair computation as the unmasked pass, with
 * dummy_mask applied so that negative entries of jjnr add nothing to the
 * potential sums or to the forces. Negative jnr values are clamped to 0
 * before being turned into coordinate and force offsets.
 * NOTE(review): no load of jnrA..jnrD from jjnr is visible in this copy,
 * and the condition guarding this pass is not visible either - confirm. */
622 /* Get j neighbor index, and coordinate index */
628 /* Sign of each element will be negative for non-real atoms.
629 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
630 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
632 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
633 jnrA = (jnrA>=0) ? jnrA : 0;
634 jnrB = (jnrB>=0) ? jnrB : 0;
635 jnrC = (jnrC>=0) ? jnrC : 0;
636 jnrD = (jnrD>=0) ? jnrD : 0;
638 j_coord_offsetA = DIM*jnrA;
639 j_coord_offsetB = DIM*jnrB;
640 j_coord_offsetC = DIM*jnrC;
641 j_coord_offsetD = DIM*jnrD;
643 /* load j atom coordinates */
644 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
645 x+j_coord_offsetC,x+j_coord_offsetD,
646 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
647 &jy2,&jz2,&jx3,&jy3,&jz3);
649 /* Calculate displacement vector */
650 dx00 = _mm_sub_ps(ix0,jx0);
651 dy00 = _mm_sub_ps(iy0,jy0);
652 dz00 = _mm_sub_ps(iz0,jz0);
653 dx11 = _mm_sub_ps(ix1,jx1);
654 dy11 = _mm_sub_ps(iy1,jy1);
655 dz11 = _mm_sub_ps(iz1,jz1);
656 dx12 = _mm_sub_ps(ix1,jx2);
657 dy12 = _mm_sub_ps(iy1,jy2);
658 dz12 = _mm_sub_ps(iz1,jz2);
659 dx13 = _mm_sub_ps(ix1,jx3);
660 dy13 = _mm_sub_ps(iy1,jy3);
661 dz13 = _mm_sub_ps(iz1,jz3);
662 dx21 = _mm_sub_ps(ix2,jx1);
663 dy21 = _mm_sub_ps(iy2,jy1);
664 dz21 = _mm_sub_ps(iz2,jz1);
665 dx22 = _mm_sub_ps(ix2,jx2);
666 dy22 = _mm_sub_ps(iy2,jy2);
667 dz22 = _mm_sub_ps(iz2,jz2);
668 dx23 = _mm_sub_ps(ix2,jx3);
669 dy23 = _mm_sub_ps(iy2,jy3);
670 dz23 = _mm_sub_ps(iz2,jz3);
671 dx31 = _mm_sub_ps(ix3,jx1);
672 dy31 = _mm_sub_ps(iy3,jy1);
673 dz31 = _mm_sub_ps(iz3,jz1);
674 dx32 = _mm_sub_ps(ix3,jx2);
675 dy32 = _mm_sub_ps(iy3,jy2);
676 dz32 = _mm_sub_ps(iz3,jz2);
677 dx33 = _mm_sub_ps(ix3,jx3);
678 dy33 = _mm_sub_ps(iy3,jy3);
679 dz33 = _mm_sub_ps(iz3,jz3);
681 /* Calculate squared distance and things based on it */
682 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
683 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
684 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
685 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
686 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
687 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
688 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
689 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
690 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
691 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
693 rinv00 = gmx_mm_invsqrt_ps(rsq00);
694 rinv11 = gmx_mm_invsqrt_ps(rsq11);
695 rinv12 = gmx_mm_invsqrt_ps(rsq12);
696 rinv13 = gmx_mm_invsqrt_ps(rsq13);
697 rinv21 = gmx_mm_invsqrt_ps(rsq21);
698 rinv22 = gmx_mm_invsqrt_ps(rsq22);
699 rinv23 = gmx_mm_invsqrt_ps(rsq23);
700 rinv31 = gmx_mm_invsqrt_ps(rsq31);
701 rinv32 = gmx_mm_invsqrt_ps(rsq32);
702 rinv33 = gmx_mm_invsqrt_ps(rsq33);
704 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
705 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
706 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
707 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
708 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
709 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
710 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
711 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
712 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
714 fjx0 = _mm_setzero_ps();
715 fjy0 = _mm_setzero_ps();
716 fjz0 = _mm_setzero_ps();
717 fjx1 = _mm_setzero_ps();
718 fjy1 = _mm_setzero_ps();
719 fjz1 = _mm_setzero_ps();
720 fjx2 = _mm_setzero_ps();
721 fjy2 = _mm_setzero_ps();
722 fjz2 = _mm_setzero_ps();
723 fjx3 = _mm_setzero_ps();
724 fjy3 = _mm_setzero_ps();
725 fjz3 = _mm_setzero_ps();
727 /**************************
728 * CALCULATE INTERACTIONS *
729 **************************/
/* Site pair i0-j0: tabulated Van der Waals only */
731 r00 = _mm_mul_ps(rsq00,rinv00);
/* Clear r in dummy lanes, so the truncated table index computed below is 0 there */
732 r00 = _mm_andnot_ps(dummy_mask,r00);
734 /* Calculate table index by multiplying r with table scale and truncate to integer */
735 rt = _mm_mul_ps(r00,vftabscale);
736 vfitab = _mm_cvttps_epi32(rt);
737 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
738 vfitab = _mm_slli_epi32(vfitab,3);
740 /* CUBIC SPLINE TABLE DISPERSION */
741 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
742 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
743 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
744 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
745 _MM_TRANSPOSE4_PS(Y,F,G,H);
746 Heps = _mm_mul_ps(vfeps,H);
747 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
748 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
749 vvdw6 = _mm_mul_ps(c6_00,VV);
750 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
751 fvdw6 = _mm_mul_ps(c6_00,FF);
753 /* CUBIC SPLINE TABLE REPULSION */
754 vfitab = _mm_add_epi32(vfitab,ifour);
755 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
756 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
757 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
758 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
759 _MM_TRANSPOSE4_PS(Y,F,G,H);
760 Heps = _mm_mul_ps(vfeps,H);
761 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
762 VV = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
763 vvdw12 = _mm_mul_ps(c12_00,VV);
764 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
765 fvdw12 = _mm_mul_ps(c12_00,FF);
766 vvdw = _mm_add_ps(vvdw12,vvdw6);
767 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
769 /* Update potential sum for this i atom from the interaction with this j atom. */
770 vvdw = _mm_andnot_ps(dummy_mask,vvdw);
771 vvdwsum = _mm_add_ps(vvdwsum,vvdw);
/* NOTE(review): fscal is masked and used here and for every pair below, but
 * no assignment to it is visible in this copy - confirm. */
775 fscal = _mm_andnot_ps(dummy_mask,fscal);
777 /* Calculate temporary vectorial force */
778 tx = _mm_mul_ps(fscal,dx00);
779 ty = _mm_mul_ps(fscal,dy00);
780 tz = _mm_mul_ps(fscal,dz00);
782 /* Update vectorial force */
783 fix0 = _mm_add_ps(fix0,tx);
784 fiy0 = _mm_add_ps(fiy0,ty);
785 fiz0 = _mm_add_ps(fiz0,tz);
787 fjx0 = _mm_add_ps(fjx0,tx);
788 fjy0 = _mm_add_ps(fjy0,ty);
789 fjz0 = _mm_add_ps(fjz0,tz);
791 /**************************
792 * CALCULATE INTERACTIONS *
793 **************************/
/* Site pair i1-j1 */
795 /* COULOMB ELECTROSTATICS */
796 velec = _mm_mul_ps(qq11,rinv11);
797 felec = _mm_mul_ps(velec,rinvsq11);
799 /* Update potential sum for this i atom from the interaction with this j atom. */
800 velec = _mm_andnot_ps(dummy_mask,velec);
801 velecsum = _mm_add_ps(velecsum,velec);
805 fscal = _mm_andnot_ps(dummy_mask,fscal);
807 /* Calculate temporary vectorial force */
808 tx = _mm_mul_ps(fscal,dx11);
809 ty = _mm_mul_ps(fscal,dy11);
810 tz = _mm_mul_ps(fscal,dz11);
812 /* Update vectorial force */
813 fix1 = _mm_add_ps(fix1,tx);
814 fiy1 = _mm_add_ps(fiy1,ty);
815 fiz1 = _mm_add_ps(fiz1,tz);
817 fjx1 = _mm_add_ps(fjx1,tx);
818 fjy1 = _mm_add_ps(fjy1,ty);
819 fjz1 = _mm_add_ps(fjz1,tz);
821 /**************************
822 * CALCULATE INTERACTIONS *
823 **************************/
/* Site pair i1-j2 */
825 /* COULOMB ELECTROSTATICS */
826 velec = _mm_mul_ps(qq12,rinv12);
827 felec = _mm_mul_ps(velec,rinvsq12);
829 /* Update potential sum for this i atom from the interaction with this j atom. */
830 velec = _mm_andnot_ps(dummy_mask,velec);
831 velecsum = _mm_add_ps(velecsum,velec);
835 fscal = _mm_andnot_ps(dummy_mask,fscal);
837 /* Calculate temporary vectorial force */
838 tx = _mm_mul_ps(fscal,dx12);
839 ty = _mm_mul_ps(fscal,dy12);
840 tz = _mm_mul_ps(fscal,dz12);
842 /* Update vectorial force */
843 fix1 = _mm_add_ps(fix1,tx);
844 fiy1 = _mm_add_ps(fiy1,ty);
845 fiz1 = _mm_add_ps(fiz1,tz);
847 fjx2 = _mm_add_ps(fjx2,tx);
848 fjy2 = _mm_add_ps(fjy2,ty);
849 fjz2 = _mm_add_ps(fjz2,tz);
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
/* Site pair i1-j3 */
855 /* COULOMB ELECTROSTATICS */
856 velec = _mm_mul_ps(qq13,rinv13);
857 felec = _mm_mul_ps(velec,rinvsq13);
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm_andnot_ps(dummy_mask,velec);
861 velecsum = _mm_add_ps(velecsum,velec);
865 fscal = _mm_andnot_ps(dummy_mask,fscal);
867 /* Calculate temporary vectorial force */
868 tx = _mm_mul_ps(fscal,dx13);
869 ty = _mm_mul_ps(fscal,dy13);
870 tz = _mm_mul_ps(fscal,dz13);
872 /* Update vectorial force */
873 fix1 = _mm_add_ps(fix1,tx);
874 fiy1 = _mm_add_ps(fiy1,ty);
875 fiz1 = _mm_add_ps(fiz1,tz);
877 fjx3 = _mm_add_ps(fjx3,tx);
878 fjy3 = _mm_add_ps(fjy3,ty);
879 fjz3 = _mm_add_ps(fjz3,tz);
881 /**************************
882 * CALCULATE INTERACTIONS *
883 **************************/
/* Site pair i2-j1 */
885 /* COULOMB ELECTROSTATICS */
886 velec = _mm_mul_ps(qq21,rinv21);
887 felec = _mm_mul_ps(velec,rinvsq21);
889 /* Update potential sum for this i atom from the interaction with this j atom. */
890 velec = _mm_andnot_ps(dummy_mask,velec);
891 velecsum = _mm_add_ps(velecsum,velec);
895 fscal = _mm_andnot_ps(dummy_mask,fscal);
897 /* Calculate temporary vectorial force */
898 tx = _mm_mul_ps(fscal,dx21);
899 ty = _mm_mul_ps(fscal,dy21);
900 tz = _mm_mul_ps(fscal,dz21);
902 /* Update vectorial force */
903 fix2 = _mm_add_ps(fix2,tx);
904 fiy2 = _mm_add_ps(fiy2,ty);
905 fiz2 = _mm_add_ps(fiz2,tz);
907 fjx1 = _mm_add_ps(fjx1,tx);
908 fjy1 = _mm_add_ps(fjy1,ty);
909 fjz1 = _mm_add_ps(fjz1,tz);
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
/* Site pair i2-j2 */
915 /* COULOMB ELECTROSTATICS */
916 velec = _mm_mul_ps(qq22,rinv22);
917 felec = _mm_mul_ps(velec,rinvsq22);
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec = _mm_andnot_ps(dummy_mask,velec);
921 velecsum = _mm_add_ps(velecsum,velec);
925 fscal = _mm_andnot_ps(dummy_mask,fscal);
927 /* Calculate temporary vectorial force */
928 tx = _mm_mul_ps(fscal,dx22);
929 ty = _mm_mul_ps(fscal,dy22);
930 tz = _mm_mul_ps(fscal,dz22);
932 /* Update vectorial force */
933 fix2 = _mm_add_ps(fix2,tx);
934 fiy2 = _mm_add_ps(fiy2,ty);
935 fiz2 = _mm_add_ps(fiz2,tz);
937 fjx2 = _mm_add_ps(fjx2,tx);
938 fjy2 = _mm_add_ps(fjy2,ty);
939 fjz2 = _mm_add_ps(fjz2,tz);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
/* Site pair i2-j3 */
945 /* COULOMB ELECTROSTATICS */
946 velec = _mm_mul_ps(qq23,rinv23);
947 felec = _mm_mul_ps(velec,rinvsq23);
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm_andnot_ps(dummy_mask,velec);
951 velecsum = _mm_add_ps(velecsum,velec);
955 fscal = _mm_andnot_ps(dummy_mask,fscal);
957 /* Calculate temporary vectorial force */
958 tx = _mm_mul_ps(fscal,dx23);
959 ty = _mm_mul_ps(fscal,dy23);
960 tz = _mm_mul_ps(fscal,dz23);
962 /* Update vectorial force */
963 fix2 = _mm_add_ps(fix2,tx);
964 fiy2 = _mm_add_ps(fiy2,ty);
965 fiz2 = _mm_add_ps(fiz2,tz);
967 fjx3 = _mm_add_ps(fjx3,tx);
968 fjy3 = _mm_add_ps(fjy3,ty);
969 fjz3 = _mm_add_ps(fjz3,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
/* Site pair i3-j1 */
975 /* COULOMB ELECTROSTATICS */
976 velec = _mm_mul_ps(qq31,rinv31);
977 felec = _mm_mul_ps(velec,rinvsq31);
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm_andnot_ps(dummy_mask,velec);
981 velecsum = _mm_add_ps(velecsum,velec);
985 fscal = _mm_andnot_ps(dummy_mask,fscal);
987 /* Calculate temporary vectorial force */
988 tx = _mm_mul_ps(fscal,dx31);
989 ty = _mm_mul_ps(fscal,dy31);
990 tz = _mm_mul_ps(fscal,dz31);
992 /* Update vectorial force */
993 fix3 = _mm_add_ps(fix3,tx);
994 fiy3 = _mm_add_ps(fiy3,ty);
995 fiz3 = _mm_add_ps(fiz3,tz);
997 fjx1 = _mm_add_ps(fjx1,tx);
998 fjy1 = _mm_add_ps(fjy1,ty);
999 fjz1 = _mm_add_ps(fjz1,tz);
1001 /**************************
1002 * CALCULATE INTERACTIONS *
1003 **************************/
/* Site pair i3-j2 */
1005 /* COULOMB ELECTROSTATICS */
1006 velec = _mm_mul_ps(qq32,rinv32);
1007 felec = _mm_mul_ps(velec,rinvsq32);
1009 /* Update potential sum for this i atom from the interaction with this j atom. */
1010 velec = _mm_andnot_ps(dummy_mask,velec);
1011 velecsum = _mm_add_ps(velecsum,velec);
1015 fscal = _mm_andnot_ps(dummy_mask,fscal);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm_mul_ps(fscal,dx32);
1019 ty = _mm_mul_ps(fscal,dy32);
1020 tz = _mm_mul_ps(fscal,dz32);
1022 /* Update vectorial force */
1023 fix3 = _mm_add_ps(fix3,tx);
1024 fiy3 = _mm_add_ps(fiy3,ty);
1025 fiz3 = _mm_add_ps(fiz3,tz);
1027 fjx2 = _mm_add_ps(fjx2,tx);
1028 fjy2 = _mm_add_ps(fjy2,ty);
1029 fjz2 = _mm_add_ps(fjz2,tz);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
/* Site pair i3-j3 */
1035 /* COULOMB ELECTROSTATICS */
1036 velec = _mm_mul_ps(qq33,rinv33);
1037 felec = _mm_mul_ps(velec,rinvsq33);
1039 /* Update potential sum for this i atom from the interaction with this j atom. */
1040 velec = _mm_andnot_ps(dummy_mask,velec);
1041 velecsum = _mm_add_ps(velecsum,velec);
1045 fscal = _mm_andnot_ps(dummy_mask,fscal);
1047 /* Calculate temporary vectorial force */
1048 tx = _mm_mul_ps(fscal,dx33);
1049 ty = _mm_mul_ps(fscal,dy33);
1050 tz = _mm_mul_ps(fscal,dz33);
1052 /* Update vectorial force */
1053 fix3 = _mm_add_ps(fix3,tx);
1054 fiy3 = _mm_add_ps(fiy3,ty);
1055 fiz3 = _mm_add_ps(fiz3,tz);
1057 fjx3 = _mm_add_ps(fjx3,tx);
1058 fjy3 = _mm_add_ps(fjy3,ty);
1059 fjz3 = _mm_add_ps(fjz3,tz);
/* Dummy lanes carry zero force sums here (fscal was masked for every pair), and their offsets were clamped to entry 0 */
1061 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1062 f+j_coord_offsetC,f+j_coord_offsetD,
1063 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1064 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1066 /* Inner loop uses 312 flops */
1069 /* End of innermost loop */
1071 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1072 f+i_coord_offset,fshift+i_shift_offset);
1075 /* Update potential energies */
1076 gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1077 gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1079 /* Increment number of inner iterations */
1080 inneriter += j_index_end - j_index_start;
1082 /* Outer loop uses 38 flops */
1085 /* Increment number of outer iterations */
1088 /* Update outer/inner flops */
1090 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*312);
1093 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single
1094 * Electrostatics interaction: Coulomb
1095 * VdW interaction: CubicSplineTable
1096 * Geometry: Water4-Water4
1097 * Calculate force/pot: Force
1100 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_sse2_single
1101 (t_nblist * gmx_restrict nlist,
1102 rvec * gmx_restrict xx,
1103 rvec * gmx_restrict ff,
1104 t_forcerec * gmx_restrict fr,
1105 t_mdatoms * gmx_restrict mdatoms,
1106 nb_kernel_data_t * gmx_restrict kernel_data,
1107 t_nrnb * gmx_restrict nrnb)
/* Force-only Water4-Water4 kernel. For each i/j water pair it evaluates one
 * cubic-spline-table VdW interaction between site 0 of each water and nine
 * plain Coulomb interactions between sites 1-3 of each water. No potential
 * energy sums are accumulated in this variant.
 *
 * nlist       - neighbor list; iinr, jindex and shift are read from it below.
 * xx, ff      - NOTE(review): presumably the coordinate and force arrays behind
 *               the local x/f pointers; those assignments are not visible in
 *               this view, so confirm against the full file.
 * fr          - supplies shift_vec, fshift, epsfac, ntype and nbfp.
 * mdatoms     - supplies chargeA and typeA.
 * kernel_data - supplies table_vdw (data pointer and scale).
 * nrnb        - flop counter handed to inc_nrnb() at the end.
 */
1109 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1110 * just 0 for non-waters.
1111 * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1112 * jnr indices corresponding to data put in the four positions in the SIMD register.
1114 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1115 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1116 int jnrA,jnrB,jnrC,jnrD;
1117 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1118 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1119 real shX,shY,shZ,rcutoff_scalar;
1120 real *shiftvec,*fshift,*x,*f;
1121 __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1123 __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1125 __m128 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1127 __m128 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1129 __m128 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1130 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1131 __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1132 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1133 __m128 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1134 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1135 __m128 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1136 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1137 __m128 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1138 __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1139 __m128 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1140 __m128 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1141 __m128 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1142 __m128 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1143 __m128 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1144 __m128 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1145 __m128 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1146 __m128 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1147 __m128 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1148 __m128 velec,felec,velecsum,facel,crf,krf,krf2;
1151 __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1154 __m128 one_sixth = _mm_set1_ps(1.0/6.0);
1155 __m128 one_twelfth = _mm_set1_ps(1.0/12.0);
1157 __m128i ifour = _mm_set1_epi32(4);
1158 __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1160 __m128 dummy_mask,cutoff_mask;
1161 __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1162 __m128 one = _mm_set1_ps(1.0);
1163 __m128 two = _mm_set1_ps(2.0);
1169 jindex = nlist->jindex;
1171 shiftidx = nlist->shift;
1173 shiftvec = fr->shift_vec[0];
1174 fshift = fr->fshift[0];
1175 facel = _mm_set1_ps(fr->epsfac);
1176 charge = mdatoms->chargeA;
1177 nvdwtype = fr->ntype;
1178 vdwparam = fr->nbfp;
1179 vdwtype = mdatoms->typeA;
1181 vftab = kernel_data->table_vdw->data;
1182 vftabscale = _mm_set1_ps(kernel_data->table_vdw->scale);
1184 /* Setup water-specific parameters */
/* Charges, the VdW type and the derived qq.. and c6_00/c12_00 pair constants
 * are all taken from the first i water in the list (nlist->iinr[0]) and are
 * used for both the i side and the j side of every pair.
 */
1185 inr = nlist->iinr[0];
1186 iq1 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1187 iq2 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1188 iq3 = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1189 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
1191 jq1 = _mm_set1_ps(charge[inr+1]);
1192 jq2 = _mm_set1_ps(charge[inr+2]);
1193 jq3 = _mm_set1_ps(charge[inr+3]);
1194 vdwjidx0A = 2*vdwtype[inr+0];
1195 c6_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1196 c12_00 = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1197 qq11 = _mm_mul_ps(iq1,jq1);
1198 qq12 = _mm_mul_ps(iq1,jq2);
1199 qq13 = _mm_mul_ps(iq1,jq3);
1200 qq21 = _mm_mul_ps(iq2,jq1);
1201 qq22 = _mm_mul_ps(iq2,jq2);
1202 qq23 = _mm_mul_ps(iq2,jq3);
1203 qq31 = _mm_mul_ps(iq3,jq1);
1204 qq32 = _mm_mul_ps(iq3,jq2);
1205 qq33 = _mm_mul_ps(iq3,jq3);
1207 /* Avoid stupid compiler warnings */
1208 jnrA = jnrB = jnrC = jnrD = 0;
1209 j_coord_offsetA = 0;
1210 j_coord_offsetB = 0;
1211 j_coord_offsetC = 0;
1212 j_coord_offsetD = 0;
1217 /* Start outer loop over neighborlists */
1218 for(iidx=0; iidx<nri; iidx++)
1220 /* Load shift vector for this list */
1221 i_shift_offset = DIM*shiftidx[iidx];
1222 shX = shiftvec[i_shift_offset+XX];
1223 shY = shiftvec[i_shift_offset+YY];
1224 shZ = shiftvec[i_shift_offset+ZZ];
1226 /* Load limits for loop over neighbors */
1227 j_index_start = jindex[iidx];
1228 j_index_end = jindex[iidx+1];
1230 /* Get outer coordinate index */
1232 i_coord_offset = DIM*inr;
1234 /* Load i particle coords and add shift vector */
1235 ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1236 iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1237 iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1238 ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1239 iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1240 iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1241 ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1242 iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1243 iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1244 ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1245 iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1246 iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1248 fix0 = _mm_setzero_ps();
1249 fiy0 = _mm_setzero_ps();
1250 fiz0 = _mm_setzero_ps();
1251 fix1 = _mm_setzero_ps();
1252 fiy1 = _mm_setzero_ps();
1253 fiz1 = _mm_setzero_ps();
1254 fix2 = _mm_setzero_ps();
1255 fiy2 = _mm_setzero_ps();
1256 fiz2 = _mm_setzero_ps();
1257 fix3 = _mm_setzero_ps();
1258 fiy3 = _mm_setzero_ps();
1259 fiz3 = _mm_setzero_ps();
1261 /* Start inner kernel loop */
/* Handles full groups of four j entries only: the loop ends at the first
 * group whose last entry jjnr[jidx+3] is negative, and that group is left to
 * the masked block that follows the loop.
 */
1262 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1265 /* Get j neighbor index, and coordinate index */
1267 jnrB = jjnr[jidx+1];
1268 jnrC = jjnr[jidx+2];
1269 jnrD = jjnr[jidx+3];
1271 j_coord_offsetA = DIM*jnrA;
1272 j_coord_offsetB = DIM*jnrB;
1273 j_coord_offsetC = DIM*jnrC;
1274 j_coord_offsetD = DIM*jnrD;
1276 /* load j atom coordinates */
1277 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1278 x+j_coord_offsetC,x+j_coord_offsetD,
1279 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1280 &jy2,&jz2,&jx3,&jy3,&jz3);
1282 /* Calculate displacement vector */
1283 dx00 = _mm_sub_ps(ix0,jx0);
1284 dy00 = _mm_sub_ps(iy0,jy0);
1285 dz00 = _mm_sub_ps(iz0,jz0);
1286 dx11 = _mm_sub_ps(ix1,jx1);
1287 dy11 = _mm_sub_ps(iy1,jy1);
1288 dz11 = _mm_sub_ps(iz1,jz1);
1289 dx12 = _mm_sub_ps(ix1,jx2);
1290 dy12 = _mm_sub_ps(iy1,jy2);
1291 dz12 = _mm_sub_ps(iz1,jz2);
1292 dx13 = _mm_sub_ps(ix1,jx3);
1293 dy13 = _mm_sub_ps(iy1,jy3);
1294 dz13 = _mm_sub_ps(iz1,jz3);
1295 dx21 = _mm_sub_ps(ix2,jx1);
1296 dy21 = _mm_sub_ps(iy2,jy1);
1297 dz21 = _mm_sub_ps(iz2,jz1);
1298 dx22 = _mm_sub_ps(ix2,jx2);
1299 dy22 = _mm_sub_ps(iy2,jy2);
1300 dz22 = _mm_sub_ps(iz2,jz2);
1301 dx23 = _mm_sub_ps(ix2,jx3);
1302 dy23 = _mm_sub_ps(iy2,jy3);
1303 dz23 = _mm_sub_ps(iz2,jz3);
1304 dx31 = _mm_sub_ps(ix3,jx1);
1305 dy31 = _mm_sub_ps(iy3,jy1);
1306 dz31 = _mm_sub_ps(iz3,jz1);
1307 dx32 = _mm_sub_ps(ix3,jx2);
1308 dy32 = _mm_sub_ps(iy3,jy2);
1309 dz32 = _mm_sub_ps(iz3,jz2);
1310 dx33 = _mm_sub_ps(ix3,jx3);
1311 dy33 = _mm_sub_ps(iy3,jy3);
1312 dz33 = _mm_sub_ps(iz3,jz3);
1314 /* Calculate squared distance and things based on it */
1315 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1316 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1317 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1318 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1319 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1320 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1321 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1322 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1323 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1324 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1326 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1327 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1328 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1329 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1330 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1331 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1332 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1333 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1334 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1335 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1337 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1338 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1339 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1340 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1341 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1342 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1343 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1344 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1345 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1347 fjx0 = _mm_setzero_ps();
1348 fjy0 = _mm_setzero_ps();
1349 fjz0 = _mm_setzero_ps();
1350 fjx1 = _mm_setzero_ps();
1351 fjy1 = _mm_setzero_ps();
1352 fjz1 = _mm_setzero_ps();
1353 fjx2 = _mm_setzero_ps();
1354 fjy2 = _mm_setzero_ps();
1355 fjz2 = _mm_setzero_ps();
1356 fjx3 = _mm_setzero_ps();
1357 fjy3 = _mm_setzero_ps();
1358 fjz3 = _mm_setzero_ps();
1360 /**************************
1361 * CALCULATE INTERACTIONS *
1362 **************************/
/* Site 0 - site 0: tabulated VdW only. r is formed as rsq * (1/r). */
1364 r00 = _mm_mul_ps(rsq00,rinv00);
1366 /* Calculate table index by multiplying r with table scale and truncate to integer */
1367 rt = _mm_mul_ps(r00,vftabscale);
1368 vfitab = _mm_cvttps_epi32(rt);
1369 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
/* Index*8 gives the float offset into vftab: the loads below read four
 * floats at that offset (dispersion) and four more at offset+4 (repulsion).
 */
1370 vfitab = _mm_slli_epi32(vfitab,3);
1372 /* CUBIC SPLINE TABLE DISPERSION */
1373 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1374 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1375 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1376 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
/* Each load above fetched four consecutive table floats for one SIMD lane;
 * the transpose regroups them so Y, F, G and H each hold one coefficient for
 * all four lanes.
 */
1377 _MM_TRANSPOSE4_PS(Y,F,G,H);
1378 Heps = _mm_mul_ps(vfeps,H);
1379 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1380 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1381 fvdw6 = _mm_mul_ps(c6_00,FF);
1383 /* CUBIC SPLINE TABLE REPULSION */
1384 vfitab = _mm_add_epi32(vfitab,ifour);
1385 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1386 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1387 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1388 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1389 _MM_TRANSPOSE4_PS(Y,F,G,H);
1390 Heps = _mm_mul_ps(vfeps,H);
1391 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1392 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1393 fvdw12 = _mm_mul_ps(c12_00,FF);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the sign. */
1394 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1398 /* Calculate temporary vectorial force */
1399 tx = _mm_mul_ps(fscal,dx00);
1400 ty = _mm_mul_ps(fscal,dy00);
1401 tz = _mm_mul_ps(fscal,dz00);
1403 /* Update vectorial force */
1404 fix0 = _mm_add_ps(fix0,tx);
1405 fiy0 = _mm_add_ps(fiy0,ty);
1406 fiz0 = _mm_add_ps(fiz0,tz);
1408 fjx0 = _mm_add_ps(fjx0,tx);
1409 fjy0 = _mm_add_ps(fjy0,ty);
1410 fjz0 = _mm_add_ps(fjz0,tz);
1412 /**************************
1413 * CALCULATE INTERACTIONS *
1414 **************************/
1416 /* COULOMB ELECTROSTATICS */
1417 velec = _mm_mul_ps(qq11,rinv11);
1418 felec = _mm_mul_ps(velec,rinvsq11);
1422 /* Calculate temporary vectorial force */
1423 tx = _mm_mul_ps(fscal,dx11);
1424 ty = _mm_mul_ps(fscal,dy11);
1425 tz = _mm_mul_ps(fscal,dz11);
1427 /* Update vectorial force */
1428 fix1 = _mm_add_ps(fix1,tx);
1429 fiy1 = _mm_add_ps(fiy1,ty);
1430 fiz1 = _mm_add_ps(fiz1,tz);
1432 fjx1 = _mm_add_ps(fjx1,tx);
1433 fjy1 = _mm_add_ps(fjy1,ty);
1434 fjz1 = _mm_add_ps(fjz1,tz);
1436 /**************************
1437 * CALCULATE INTERACTIONS *
1438 **************************/
1440 /* COULOMB ELECTROSTATICS */
1441 velec = _mm_mul_ps(qq12,rinv12);
1442 felec = _mm_mul_ps(velec,rinvsq12);
1446 /* Calculate temporary vectorial force */
1447 tx = _mm_mul_ps(fscal,dx12);
1448 ty = _mm_mul_ps(fscal,dy12);
1449 tz = _mm_mul_ps(fscal,dz12);
1451 /* Update vectorial force */
1452 fix1 = _mm_add_ps(fix1,tx);
1453 fiy1 = _mm_add_ps(fiy1,ty);
1454 fiz1 = _mm_add_ps(fiz1,tz);
1456 fjx2 = _mm_add_ps(fjx2,tx);
1457 fjy2 = _mm_add_ps(fjy2,ty);
1458 fjz2 = _mm_add_ps(fjz2,tz);
1460 /**************************
1461 * CALCULATE INTERACTIONS *
1462 **************************/
1464 /* COULOMB ELECTROSTATICS */
1465 velec = _mm_mul_ps(qq13,rinv13);
1466 felec = _mm_mul_ps(velec,rinvsq13);
1470 /* Calculate temporary vectorial force */
1471 tx = _mm_mul_ps(fscal,dx13);
1472 ty = _mm_mul_ps(fscal,dy13);
1473 tz = _mm_mul_ps(fscal,dz13);
1475 /* Update vectorial force */
1476 fix1 = _mm_add_ps(fix1,tx);
1477 fiy1 = _mm_add_ps(fiy1,ty);
1478 fiz1 = _mm_add_ps(fiz1,tz);
1480 fjx3 = _mm_add_ps(fjx3,tx);
1481 fjy3 = _mm_add_ps(fjy3,ty);
1482 fjz3 = _mm_add_ps(fjz3,tz);
1484 /**************************
1485 * CALCULATE INTERACTIONS *
1486 **************************/
1488 /* COULOMB ELECTROSTATICS */
1489 velec = _mm_mul_ps(qq21,rinv21);
1490 felec = _mm_mul_ps(velec,rinvsq21);
1494 /* Calculate temporary vectorial force */
1495 tx = _mm_mul_ps(fscal,dx21);
1496 ty = _mm_mul_ps(fscal,dy21);
1497 tz = _mm_mul_ps(fscal,dz21);
1499 /* Update vectorial force */
1500 fix2 = _mm_add_ps(fix2,tx);
1501 fiy2 = _mm_add_ps(fiy2,ty);
1502 fiz2 = _mm_add_ps(fiz2,tz);
1504 fjx1 = _mm_add_ps(fjx1,tx);
1505 fjy1 = _mm_add_ps(fjy1,ty);
1506 fjz1 = _mm_add_ps(fjz1,tz);
1508 /**************************
1509 * CALCULATE INTERACTIONS *
1510 **************************/
1512 /* COULOMB ELECTROSTATICS */
1513 velec = _mm_mul_ps(qq22,rinv22);
1514 felec = _mm_mul_ps(velec,rinvsq22);
1518 /* Calculate temporary vectorial force */
1519 tx = _mm_mul_ps(fscal,dx22);
1520 ty = _mm_mul_ps(fscal,dy22);
1521 tz = _mm_mul_ps(fscal,dz22);
1523 /* Update vectorial force */
1524 fix2 = _mm_add_ps(fix2,tx);
1525 fiy2 = _mm_add_ps(fiy2,ty);
1526 fiz2 = _mm_add_ps(fiz2,tz);
1528 fjx2 = _mm_add_ps(fjx2,tx);
1529 fjy2 = _mm_add_ps(fjy2,ty);
1530 fjz2 = _mm_add_ps(fjz2,tz);
1532 /**************************
1533 * CALCULATE INTERACTIONS *
1534 **************************/
1536 /* COULOMB ELECTROSTATICS */
1537 velec = _mm_mul_ps(qq23,rinv23);
1538 felec = _mm_mul_ps(velec,rinvsq23);
1542 /* Calculate temporary vectorial force */
1543 tx = _mm_mul_ps(fscal,dx23);
1544 ty = _mm_mul_ps(fscal,dy23);
1545 tz = _mm_mul_ps(fscal,dz23);
1547 /* Update vectorial force */
1548 fix2 = _mm_add_ps(fix2,tx);
1549 fiy2 = _mm_add_ps(fiy2,ty);
1550 fiz2 = _mm_add_ps(fiz2,tz);
1552 fjx3 = _mm_add_ps(fjx3,tx);
1553 fjy3 = _mm_add_ps(fjy3,ty);
1554 fjz3 = _mm_add_ps(fjz3,tz);
1556 /**************************
1557 * CALCULATE INTERACTIONS *
1558 **************************/
1560 /* COULOMB ELECTROSTATICS */
1561 velec = _mm_mul_ps(qq31,rinv31);
1562 felec = _mm_mul_ps(velec,rinvsq31);
1566 /* Calculate temporary vectorial force */
1567 tx = _mm_mul_ps(fscal,dx31);
1568 ty = _mm_mul_ps(fscal,dy31);
1569 tz = _mm_mul_ps(fscal,dz31);
1571 /* Update vectorial force */
1572 fix3 = _mm_add_ps(fix3,tx);
1573 fiy3 = _mm_add_ps(fiy3,ty);
1574 fiz3 = _mm_add_ps(fiz3,tz);
1576 fjx1 = _mm_add_ps(fjx1,tx);
1577 fjy1 = _mm_add_ps(fjy1,ty);
1578 fjz1 = _mm_add_ps(fjz1,tz);
1580 /**************************
1581 * CALCULATE INTERACTIONS *
1582 **************************/
1584 /* COULOMB ELECTROSTATICS */
1585 velec = _mm_mul_ps(qq32,rinv32);
1586 felec = _mm_mul_ps(velec,rinvsq32);
1590 /* Calculate temporary vectorial force */
1591 tx = _mm_mul_ps(fscal,dx32);
1592 ty = _mm_mul_ps(fscal,dy32);
1593 tz = _mm_mul_ps(fscal,dz32);
1595 /* Update vectorial force */
1596 fix3 = _mm_add_ps(fix3,tx);
1597 fiy3 = _mm_add_ps(fiy3,ty);
1598 fiz3 = _mm_add_ps(fiz3,tz);
1600 fjx2 = _mm_add_ps(fjx2,tx);
1601 fjy2 = _mm_add_ps(fjy2,ty);
1602 fjz2 = _mm_add_ps(fjz2,tz);
1604 /**************************
1605 * CALCULATE INTERACTIONS *
1606 **************************/
1608 /* COULOMB ELECTROSTATICS */
1609 velec = _mm_mul_ps(qq33,rinv33);
1610 felec = _mm_mul_ps(velec,rinvsq33);
1614 /* Calculate temporary vectorial force */
1615 tx = _mm_mul_ps(fscal,dx33);
1616 ty = _mm_mul_ps(fscal,dy33);
1617 tz = _mm_mul_ps(fscal,dz33);
1619 /* Update vectorial force */
1620 fix3 = _mm_add_ps(fix3,tx);
1621 fiy3 = _mm_add_ps(fiy3,ty);
1622 fiz3 = _mm_add_ps(fiz3,tz);
1624 fjx3 = _mm_add_ps(fjx3,tx);
1625 fjy3 = _mm_add_ps(fjy3,ty);
1626 fjz3 = _mm_add_ps(fjz3,tz);
1628 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1629 f+j_coord_offsetC,f+j_coord_offsetD,
1630 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1631 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1633 /* Inner loop uses 294 flops */
/* Remainder group: reached when the loop above stopped before j_index_end.
 * It repeats the same computation with dummy (negative) j entries masked out.
 */
1636 if(jidx<j_index_end)
1639 /* Get j neighbor index, and coordinate index */
1641 jnrB = jjnr[jidx+1];
1642 jnrC = jjnr[jidx+2];
1643 jnrD = jjnr[jidx+3];
1645 /* Sign of each element will be negative for non-real atoms.
1646 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1647 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1649 dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1650 jnrA = (jnrA>=0) ? jnrA : 0;
1651 jnrB = (jnrB>=0) ? jnrB : 0;
1652 jnrC = (jnrC>=0) ? jnrC : 0;
1653 jnrD = (jnrD>=0) ? jnrD : 0;
1655 j_coord_offsetA = DIM*jnrA;
1656 j_coord_offsetB = DIM*jnrB;
1657 j_coord_offsetC = DIM*jnrC;
1658 j_coord_offsetD = DIM*jnrD;
1660 /* load j atom coordinates */
1661 gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1662 x+j_coord_offsetC,x+j_coord_offsetD,
1663 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1664 &jy2,&jz2,&jx3,&jy3,&jz3);
1666 /* Calculate displacement vector */
1667 dx00 = _mm_sub_ps(ix0,jx0);
1668 dy00 = _mm_sub_ps(iy0,jy0);
1669 dz00 = _mm_sub_ps(iz0,jz0);
1670 dx11 = _mm_sub_ps(ix1,jx1);
1671 dy11 = _mm_sub_ps(iy1,jy1);
1672 dz11 = _mm_sub_ps(iz1,jz1);
1673 dx12 = _mm_sub_ps(ix1,jx2);
1674 dy12 = _mm_sub_ps(iy1,jy2);
1675 dz12 = _mm_sub_ps(iz1,jz2);
1676 dx13 = _mm_sub_ps(ix1,jx3);
1677 dy13 = _mm_sub_ps(iy1,jy3);
1678 dz13 = _mm_sub_ps(iz1,jz3);
1679 dx21 = _mm_sub_ps(ix2,jx1);
1680 dy21 = _mm_sub_ps(iy2,jy1);
1681 dz21 = _mm_sub_ps(iz2,jz1);
1682 dx22 = _mm_sub_ps(ix2,jx2);
1683 dy22 = _mm_sub_ps(iy2,jy2);
1684 dz22 = _mm_sub_ps(iz2,jz2);
1685 dx23 = _mm_sub_ps(ix2,jx3);
1686 dy23 = _mm_sub_ps(iy2,jy3);
1687 dz23 = _mm_sub_ps(iz2,jz3);
1688 dx31 = _mm_sub_ps(ix3,jx1);
1689 dy31 = _mm_sub_ps(iy3,jy1);
1690 dz31 = _mm_sub_ps(iz3,jz1);
1691 dx32 = _mm_sub_ps(ix3,jx2);
1692 dy32 = _mm_sub_ps(iy3,jy2);
1693 dz32 = _mm_sub_ps(iz3,jz2);
1694 dx33 = _mm_sub_ps(ix3,jx3);
1695 dy33 = _mm_sub_ps(iy3,jy3);
1696 dz33 = _mm_sub_ps(iz3,jz3);
1698 /* Calculate squared distance and things based on it */
1699 rsq00 = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1700 rsq11 = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1701 rsq12 = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1702 rsq13 = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1703 rsq21 = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1704 rsq22 = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1705 rsq23 = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1706 rsq31 = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1707 rsq32 = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1708 rsq33 = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1710 rinv00 = gmx_mm_invsqrt_ps(rsq00);
1711 rinv11 = gmx_mm_invsqrt_ps(rsq11);
1712 rinv12 = gmx_mm_invsqrt_ps(rsq12);
1713 rinv13 = gmx_mm_invsqrt_ps(rsq13);
1714 rinv21 = gmx_mm_invsqrt_ps(rsq21);
1715 rinv22 = gmx_mm_invsqrt_ps(rsq22);
1716 rinv23 = gmx_mm_invsqrt_ps(rsq23);
1717 rinv31 = gmx_mm_invsqrt_ps(rsq31);
1718 rinv32 = gmx_mm_invsqrt_ps(rsq32);
1719 rinv33 = gmx_mm_invsqrt_ps(rsq33);
1721 rinvsq11 = _mm_mul_ps(rinv11,rinv11);
1722 rinvsq12 = _mm_mul_ps(rinv12,rinv12);
1723 rinvsq13 = _mm_mul_ps(rinv13,rinv13);
1724 rinvsq21 = _mm_mul_ps(rinv21,rinv21);
1725 rinvsq22 = _mm_mul_ps(rinv22,rinv22);
1726 rinvsq23 = _mm_mul_ps(rinv23,rinv23);
1727 rinvsq31 = _mm_mul_ps(rinv31,rinv31);
1728 rinvsq32 = _mm_mul_ps(rinv32,rinv32);
1729 rinvsq33 = _mm_mul_ps(rinv33,rinv33);
1731 fjx0 = _mm_setzero_ps();
1732 fjy0 = _mm_setzero_ps();
1733 fjz0 = _mm_setzero_ps();
1734 fjx1 = _mm_setzero_ps();
1735 fjy1 = _mm_setzero_ps();
1736 fjz1 = _mm_setzero_ps();
1737 fjx2 = _mm_setzero_ps();
1738 fjy2 = _mm_setzero_ps();
1739 fjz2 = _mm_setzero_ps();
1740 fjx3 = _mm_setzero_ps();
1741 fjy3 = _mm_setzero_ps();
1742 fjz3 = _mm_setzero_ps();
1744 /**************************
1745 * CALCULATE INTERACTIONS *
1746 **************************/
1748 r00 = _mm_mul_ps(rsq00,rinv00);
/* Zero r in dummy lanes so that their table index below becomes 0. */
1749 r00 = _mm_andnot_ps(dummy_mask,r00);
1751 /* Calculate table index by multiplying r with table scale and truncate to integer */
1752 rt = _mm_mul_ps(r00,vftabscale);
1753 vfitab = _mm_cvttps_epi32(rt);
1754 vfeps = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1755 vfitab = _mm_slli_epi32(vfitab,3);
1757 /* CUBIC SPLINE TABLE DISPERSION */
1758 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1759 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1760 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1761 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1762 _MM_TRANSPOSE4_PS(Y,F,G,H);
1763 Heps = _mm_mul_ps(vfeps,H);
1764 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1765 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1766 fvdw6 = _mm_mul_ps(c6_00,FF);
1768 /* CUBIC SPLINE TABLE REPULSION */
1769 vfitab = _mm_add_epi32(vfitab,ifour);
1770 Y = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1771 F = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1772 G = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1773 H = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1774 _MM_TRANSPOSE4_PS(Y,F,G,H);
1775 Heps = _mm_mul_ps(vfeps,H);
1776 Fp = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1777 FF = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1778 fvdw12 = _mm_mul_ps(c12_00,FF);
1779 fvdw = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
/* Zero the scalar force in dummy lanes; the same masking is repeated for
 * every pair below so dummy entries add nothing to the i or j forces.
 */
1783 fscal = _mm_andnot_ps(dummy_mask,fscal);
1785 /* Calculate temporary vectorial force */
1786 tx = _mm_mul_ps(fscal,dx00);
1787 ty = _mm_mul_ps(fscal,dy00);
1788 tz = _mm_mul_ps(fscal,dz00);
1790 /* Update vectorial force */
1791 fix0 = _mm_add_ps(fix0,tx);
1792 fiy0 = _mm_add_ps(fiy0,ty);
1793 fiz0 = _mm_add_ps(fiz0,tz);
1795 fjx0 = _mm_add_ps(fjx0,tx);
1796 fjy0 = _mm_add_ps(fjy0,ty);
1797 fjz0 = _mm_add_ps(fjz0,tz);
1799 /**************************
1800 * CALCULATE INTERACTIONS *
1801 **************************/
1803 /* COULOMB ELECTROSTATICS */
1804 velec = _mm_mul_ps(qq11,rinv11);
1805 felec = _mm_mul_ps(velec,rinvsq11);
1809 fscal = _mm_andnot_ps(dummy_mask,fscal);
1811 /* Calculate temporary vectorial force */
1812 tx = _mm_mul_ps(fscal,dx11);
1813 ty = _mm_mul_ps(fscal,dy11);
1814 tz = _mm_mul_ps(fscal,dz11);
1816 /* Update vectorial force */
1817 fix1 = _mm_add_ps(fix1,tx);
1818 fiy1 = _mm_add_ps(fiy1,ty);
1819 fiz1 = _mm_add_ps(fiz1,tz);
1821 fjx1 = _mm_add_ps(fjx1,tx);
1822 fjy1 = _mm_add_ps(fjy1,ty);
1823 fjz1 = _mm_add_ps(fjz1,tz);
1825 /**************************
1826 * CALCULATE INTERACTIONS *
1827 **************************/
1829 /* COULOMB ELECTROSTATICS */
1830 velec = _mm_mul_ps(qq12,rinv12);
1831 felec = _mm_mul_ps(velec,rinvsq12);
1835 fscal = _mm_andnot_ps(dummy_mask,fscal);
1837 /* Calculate temporary vectorial force */
1838 tx = _mm_mul_ps(fscal,dx12);
1839 ty = _mm_mul_ps(fscal,dy12);
1840 tz = _mm_mul_ps(fscal,dz12);
1842 /* Update vectorial force */
1843 fix1 = _mm_add_ps(fix1,tx);
1844 fiy1 = _mm_add_ps(fiy1,ty);
1845 fiz1 = _mm_add_ps(fiz1,tz);
1847 fjx2 = _mm_add_ps(fjx2,tx);
1848 fjy2 = _mm_add_ps(fjy2,ty);
1849 fjz2 = _mm_add_ps(fjz2,tz);
1851 /**************************
1852 * CALCULATE INTERACTIONS *
1853 **************************/
1855 /* COULOMB ELECTROSTATICS */
1856 velec = _mm_mul_ps(qq13,rinv13);
1857 felec = _mm_mul_ps(velec,rinvsq13);
1861 fscal = _mm_andnot_ps(dummy_mask,fscal);
1863 /* Calculate temporary vectorial force */
1864 tx = _mm_mul_ps(fscal,dx13);
1865 ty = _mm_mul_ps(fscal,dy13);
1866 tz = _mm_mul_ps(fscal,dz13);
1868 /* Update vectorial force */
1869 fix1 = _mm_add_ps(fix1,tx);
1870 fiy1 = _mm_add_ps(fiy1,ty);
1871 fiz1 = _mm_add_ps(fiz1,tz);
1873 fjx3 = _mm_add_ps(fjx3,tx);
1874 fjy3 = _mm_add_ps(fjy3,ty);
1875 fjz3 = _mm_add_ps(fjz3,tz);
1877 /**************************
1878 * CALCULATE INTERACTIONS *
1879 **************************/
1881 /* COULOMB ELECTROSTATICS */
1882 velec = _mm_mul_ps(qq21,rinv21);
1883 felec = _mm_mul_ps(velec,rinvsq21);
1887 fscal = _mm_andnot_ps(dummy_mask,fscal);
1889 /* Calculate temporary vectorial force */
1890 tx = _mm_mul_ps(fscal,dx21);
1891 ty = _mm_mul_ps(fscal,dy21);
1892 tz = _mm_mul_ps(fscal,dz21);
1894 /* Update vectorial force */
1895 fix2 = _mm_add_ps(fix2,tx);
1896 fiy2 = _mm_add_ps(fiy2,ty);
1897 fiz2 = _mm_add_ps(fiz2,tz);
1899 fjx1 = _mm_add_ps(fjx1,tx);
1900 fjy1 = _mm_add_ps(fjy1,ty);
1901 fjz1 = _mm_add_ps(fjz1,tz);
1903 /**************************
1904 * CALCULATE INTERACTIONS *
1905 **************************/
1907 /* COULOMB ELECTROSTATICS */
1908 velec = _mm_mul_ps(qq22,rinv22);
1909 felec = _mm_mul_ps(velec,rinvsq22);
1913 fscal = _mm_andnot_ps(dummy_mask,fscal);
1915 /* Calculate temporary vectorial force */
1916 tx = _mm_mul_ps(fscal,dx22);
1917 ty = _mm_mul_ps(fscal,dy22);
1918 tz = _mm_mul_ps(fscal,dz22);
1920 /* Update vectorial force */
1921 fix2 = _mm_add_ps(fix2,tx);
1922 fiy2 = _mm_add_ps(fiy2,ty);
1923 fiz2 = _mm_add_ps(fiz2,tz);
1925 fjx2 = _mm_add_ps(fjx2,tx);
1926 fjy2 = _mm_add_ps(fjy2,ty);
1927 fjz2 = _mm_add_ps(fjz2,tz);
1929 /**************************
1930 * CALCULATE INTERACTIONS *
1931 **************************/
1933 /* COULOMB ELECTROSTATICS */
1934 velec = _mm_mul_ps(qq23,rinv23);
1935 felec = _mm_mul_ps(velec,rinvsq23);
1939 fscal = _mm_andnot_ps(dummy_mask,fscal);
1941 /* Calculate temporary vectorial force */
1942 tx = _mm_mul_ps(fscal,dx23);
1943 ty = _mm_mul_ps(fscal,dy23);
1944 tz = _mm_mul_ps(fscal,dz23);
1946 /* Update vectorial force */
1947 fix2 = _mm_add_ps(fix2,tx);
1948 fiy2 = _mm_add_ps(fiy2,ty);
1949 fiz2 = _mm_add_ps(fiz2,tz);
1951 fjx3 = _mm_add_ps(fjx3,tx);
1952 fjy3 = _mm_add_ps(fjy3,ty);
1953 fjz3 = _mm_add_ps(fjz3,tz);
1955 /**************************
1956 * CALCULATE INTERACTIONS *
1957 **************************/
1959 /* COULOMB ELECTROSTATICS */
1960 velec = _mm_mul_ps(qq31,rinv31);
1961 felec = _mm_mul_ps(velec,rinvsq31);
1965 fscal = _mm_andnot_ps(dummy_mask,fscal);
1967 /* Calculate temporary vectorial force */
1968 tx = _mm_mul_ps(fscal,dx31);
1969 ty = _mm_mul_ps(fscal,dy31);
1970 tz = _mm_mul_ps(fscal,dz31);
1972 /* Update vectorial force */
1973 fix3 = _mm_add_ps(fix3,tx);
1974 fiy3 = _mm_add_ps(fiy3,ty);
1975 fiz3 = _mm_add_ps(fiz3,tz);
1977 fjx1 = _mm_add_ps(fjx1,tx);
1978 fjy1 = _mm_add_ps(fjy1,ty);
1979 fjz1 = _mm_add_ps(fjz1,tz);
1981 /**************************
1982 * CALCULATE INTERACTIONS *
1983 **************************/
1985 /* COULOMB ELECTROSTATICS */
1986 velec = _mm_mul_ps(qq32,rinv32);
1987 felec = _mm_mul_ps(velec,rinvsq32);
1991 fscal = _mm_andnot_ps(dummy_mask,fscal);
1993 /* Calculate temporary vectorial force */
1994 tx = _mm_mul_ps(fscal,dx32);
1995 ty = _mm_mul_ps(fscal,dy32);
1996 tz = _mm_mul_ps(fscal,dz32);
1998 /* Update vectorial force */
1999 fix3 = _mm_add_ps(fix3,tx);
2000 fiy3 = _mm_add_ps(fiy3,ty);
2001 fiz3 = _mm_add_ps(fiz3,tz);
2003 fjx2 = _mm_add_ps(fjx2,tx);
2004 fjy2 = _mm_add_ps(fjy2,ty);
2005 fjz2 = _mm_add_ps(fjz2,tz);
2007 /**************************
2008 * CALCULATE INTERACTIONS *
2009 **************************/
2011 /* COULOMB ELECTROSTATICS */
2012 velec = _mm_mul_ps(qq33,rinv33);
2013 felec = _mm_mul_ps(velec,rinvsq33);
2017 fscal = _mm_andnot_ps(dummy_mask,fscal);
2019 /* Calculate temporary vectorial force */
2020 tx = _mm_mul_ps(fscal,dx33);
2021 ty = _mm_mul_ps(fscal,dy33);
2022 tz = _mm_mul_ps(fscal,dz33);
2024 /* Update vectorial force */
2025 fix3 = _mm_add_ps(fix3,tx);
2026 fiy3 = _mm_add_ps(fiy3,ty);
2027 fiz3 = _mm_add_ps(fiz3,tz);
2029 fjx3 = _mm_add_ps(fjx3,tx);
2030 fjy3 = _mm_add_ps(fjy3,ty);
2031 fjz3 = _mm_add_ps(fjz3,tz);
2033 gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2034 f+j_coord_offsetC,f+j_coord_offsetD,
2035 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2036 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2038 /* Inner loop uses 295 flops */
2041 /* End of innermost loop */
2043 gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2044 f+i_coord_offset,fshift+i_shift_offset);
2046 /* Increment number of inner iterations */
2047 inneriter += j_index_end - j_index_start;
2049 /* Outer loop uses 36 flops */
2052 /* Increment number of outer iterations */
2055 /* Update outer/inner flops */
/* Every inner iteration is charged 295 flops (the masked-block figure), even
 * though the unmasked loop above is annotated as 294.
 */
2057 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*295);