int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 38 flops */
+ /* Outer loop uses 26 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*368);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*368);
}
/*
* Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW4W4_F_sse2_single
int i_shift_offset,i_coord_offset,outeriter,inneriter;
int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
int jnrA,jnrB,jnrC,jnrD;
+ int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
int *iinr,*jindex,*jjnr,*shiftidx,*gid;
- real shX,shY,shZ,rcutoff_scalar;
+ real rcutoff_scalar;
real *shiftvec,*fshift,*x,*f;
+ real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
+ real scratch[4*DIM];
__m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
int vdwioffset0;
__m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
outeriter = 0;
inneriter = 0;
+ for(iidx=0;iidx<4*DIM;iidx++)
+ {
+ scratch[iidx] = 0.0;
+ }
+
/* Start outer loop over neighborlists */
for(iidx=0; iidx<nri; iidx++)
{
/* Load shift vector for this list */
i_shift_offset = DIM*shiftidx[iidx];
- shX = shiftvec[i_shift_offset+XX];
- shY = shiftvec[i_shift_offset+YY];
- shZ = shiftvec[i_shift_offset+ZZ];
/* Load limits for loop over neighbors */
j_index_start = jindex[iidx];
i_coord_offset = DIM*inr;
/* Load i particle coords and add shift vector */
- ix0 = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
- iy0 = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
- iz0 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
- ix1 = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
- iy1 = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
- iz1 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
- ix2 = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
- iy2 = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
- iz2 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
- ix3 = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
- iy3 = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
- iz3 = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
-
+ gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
+ &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
+
fix0 = _mm_setzero_ps();
fiy0 = _mm_setzero_ps();
fiz0 = _mm_setzero_ps();
jnrB = jjnr[jidx+1];
jnrC = jjnr[jidx+2];
jnrD = jjnr[jidx+3];
-
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = f+j_coord_offsetA;
+ fjptrB = f+j_coord_offsetB;
+ fjptrC = f+j_coord_offsetC;
+ fjptrD = f+j_coord_offsetD;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
{
/* Get j neighbor index, and coordinate index */
- jnrA = jjnr[jidx];
- jnrB = jjnr[jidx+1];
- jnrC = jjnr[jidx+2];
- jnrD = jjnr[jidx+3];
-
+ jnrlistA = jjnr[jidx];
+ jnrlistB = jjnr[jidx+1];
+ jnrlistC = jjnr[jidx+2];
+ jnrlistD = jjnr[jidx+3];
/* Sign of each element will be negative for non-real atoms.
* This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
* so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
*/
dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
- jnrA = (jnrA>=0) ? jnrA : 0;
- jnrB = (jnrB>=0) ? jnrB : 0;
- jnrC = (jnrC>=0) ? jnrC : 0;
- jnrD = (jnrD>=0) ? jnrD : 0;
-
+ jnrA = (jnrlistA>=0) ? jnrlistA : 0;
+ jnrB = (jnrlistB>=0) ? jnrlistB : 0;
+ jnrC = (jnrlistC>=0) ? jnrlistC : 0;
+ jnrD = (jnrlistD>=0) ? jnrlistD : 0;
j_coord_offsetA = DIM*jnrA;
j_coord_offsetB = DIM*jnrB;
j_coord_offsetC = DIM*jnrC;
fjx0 = _mm_add_ps(fjx0,tx);
fjy0 = _mm_add_ps(fjy0,ty);
fjz0 = _mm_add_ps(fjz0,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
/**************************
fjx1 = _mm_add_ps(fjx1,tx);
fjy1 = _mm_add_ps(fjy1,ty);
fjz1 = _mm_add_ps(fjz1,tz);
-
+
}
/**************************
fjx2 = _mm_add_ps(fjx2,tx);
fjy2 = _mm_add_ps(fjy2,ty);
fjz2 = _mm_add_ps(fjz2,tz);
-
+
}
/**************************
fjx3 = _mm_add_ps(fjx3,tx);
fjy3 = _mm_add_ps(fjy3,ty);
fjz3 = _mm_add_ps(fjz3,tz);
-
+
}
- gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
- f+j_coord_offsetC,f+j_coord_offsetD,
+ fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
+ fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
+ fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
+ fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
+
+ gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
/* Increment number of inner iterations */
inneriter += j_index_end - j_index_start;
- /* Outer loop uses 36 flops */
+ /* Outer loop uses 24 flops */
}
/* Increment number of outer iterations */
/* Update outer/inner flops */
- inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*303);
+ inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*303);
}