*/
for(i=0; i<ntype; i++)
{
- c6 = nbfp[(i*ntype+i)*2 ];
- c12 = nbfp[(i*ntype+i)*2+1];
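+ /* Note: as the existing comments below indicate, nbfp stores C6 and
+ * C12 with the kernel prefactors 6 and 12 multiplied in, so the
+ * diagonal entries are scaled back to plain C6/C12 here before
+ * sigma and epsilon are derived from them.
+ */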
+ c6 = nbfp[(i*ntype+i)*2 ]/6.0;
+ c12 = nbfp[(i*ntype+i)*2+1]/12.0;
if (c6 > 0 && c12 > 0)
{
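/* For the diagonal i,i entry, derive the LJ parameters
* sigma = (C12/C6)^(1/6) and eps = C6^2/(4 C12); these feed the
* Lorentz-Berthelot check further down.
*/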
nbat->nbfp_comb[i*2 ] = pow(c12/c6,1.0/6.0);
nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
c6 = nbfp[(i*ntype+j)*2 ];
c12 = nbfp[(i*ntype+j)*2+1];
nbat->nbfp[(i*nbat->ntype+j)*2 ] = c6;
nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;
- c6 /= 6.0;
- c12 /= 12.0;
-
+
+ /* Compare 6*C6 and 12*C12 for geometric combination rule */
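+ /* Geometric rule: C6_ij = sqrt(C6_ii*C6_jj), and likewise for C12.
+ * Squaring both sides lets us test the stored 6*C6/12*C12 values
+ * directly, since the constant prefactor cancels.
+ */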
bCombGeom = bCombGeom &&
gmx_within_tol(c6*c6 ,nbfp[(i*ntype+i)*2 ]*nbfp[(j*ntype+j)*2 ],tol) &&
gmx_within_tol(c12*c12,nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1],tol);
+ /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
+ c6 /= 6.0;
+ c12 /= 12.0;
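+ /* Lorentz-Berthelot: sigma_ij = (sigma_i+sigma_j)/2 and
+ * eps_ij = sqrt(eps_i*eps_j); the checks below compare sigma and
+ * eps recovered from the i,j entry against these combinations.
+ */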
bCombLB = bCombLB &&
((c6 == 0 && c12 == 0 &&
(nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
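/* Reduce the nsrc force buffers in src into dest over the flat real
* index range [i0,i1). Passing the buffers as plain real pointers
* (instead of nbnxn_atomdata_output_t) lets the caller reduce an
* arbitrary subset of the per-thread outputs, as done below.
*/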
static void
nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
- nbnxn_atomdata_output_t * gmx_restrict src,
+ real ** gmx_restrict src,
int nsrc,
int i0, int i1)
{
int i,s;

for(i=i0; i<i1; i++)
{
for(s=0; s<nsrc; s++)
{
- dest[i] += src[s].f[i];
+ dest[i] += src[s][i];
}
}
}
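/* As above, but using 128-bit x86 SIMD; i0 and i1 are assumed to be
* multiples of the SIMD width, with dest and the src buffers
* SIMD-aligned.
*/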
static void
nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
- nbnxn_atomdata_output_t * gmx_restrict src,
+ real ** gmx_restrict src,
int nsrc,
int i0, int i1)
{
#ifdef NBNXN_SEARCH_SSE
-#ifdef GMX_X86_AVX_256
-#define GMX_MM256_HERE
-#else
+/* We can use AVX256 here, but not when AVX128 kernels are selected.
+ * As this reduction is not faster with AVX256 anyway, we use 128-bit SIMD.
+ */
#define GMX_MM128_HERE
-#endif
#include "gmx_x86_simd_macros.h"
int i,s;
gmx_mm_pr dest_SSE,src_SSE;

for(i=i0; i<i1; i+=GMX_SIMD_WIDTH_HERE)
{
dest_SSE = gmx_load_pr(dest+i);
for(s=0; s<nsrc; s++)
{
- src_SSE = gmx_load_pr(src[s].f+i);
+ src_SSE = gmx_load_pr(src[s]+i);
dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
}
gmx_store_pr(dest+i,dest_SSE);
}
#pragma omp parallel for num_threads(nth) schedule(static)
for(th=0; th<nth; th++)
{
- int g0,g1;
- int b0,b1,nb;
- int blocksize,i0,i1;
+ int g0,g1,g;
/* For which grids should we reduce the force output? */
g0 = ((locality==eatLocal || locality==eatAll) ? 0 : 1);
g1 = (locality==eatLocal ? 1 : nbs->ngrid);
- /* Get the grid cell bounds */
- b0 = nbs->grid[g0].cell0;
- b1 = nbs->grid[g1-1].cell0 + nbs->grid[g1-1].nc;
- blocksize = nbs->grid[g0].na_sc*nbat->fstride;
- /* The simple grid size in atoms is a multiple of na_cj.
- * With float-AVX256 we use this and make blocksize a multiple of 8.
- */
- if (nbs->grid[0].bSimple && nbs->grid[0].na_cj > nbs->grid[0].na_c)
+ for(g=g0; g<g1; g++)
{
- blocksize *= 2;
- b0 /= 2;
- b1 /= 2;
- }
- nb = b1 - b0;
+ nbnxn_grid_t *grid;
+ int b0,b1,b;
+ int c0,c1,i0,i1;
+ int nfptr;
+ real *fptr[NBNXN_CELLBLOCK_MAX_THREADS];
+ int out;
+
+ grid = &nbs->grid[g];
- /* Calculate the index range for our thread */
- i0 = (b0 + (nb* th )/nth)*blocksize;
- i1 = (b0 + (nb*(th+1))/nth)*blocksize;
+ /* Calculate the cell-block range for our thread */
+ b0 = (grid->cellblock_flags.ncb* th )/nth;
+ b1 = (grid->cellblock_flags.ncb*(th+1))/nth;
+ if (grid->cellblock_flags.bUse)
+ {
+ for(b=b0; b<b1; b++)
+ {
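+ /* Convert cell block b to a flat index range in the force
+ * buffer: na_c atoms per cell, fstride reals per atom.
+ */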
+ c0 = b*NBNXN_CELLBLOCK_SIZE;
+ c1 = min(c0 + NBNXN_CELLBLOCK_SIZE,grid->nc);
+ i0 = (grid->cell0 + c0)*grid->na_c*nbat->fstride;
+ i1 = (grid->cell0 + c1)*grid->na_c*nbat->fstride;
+
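+ /* Collect the output buffers that touched this cell block,
+ * as marked in the block's bit flags; buffer 0 is the
+ * reduction destination and is skipped.
+ */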
+ nfptr = 0;
+ for(out=1; out<nbat->nout; out++)
+ {
+ if (grid->cellblock_flags.flag[b] & (1U<<out))
+ {
+ fptr[nfptr++] = nbat->out[out].f;
+ }
+ }
+ if (nfptr > 0)
+ {
#ifdef NBNXN_SEARCH_SSE
- nbnxn_atomdata_reduce_reals_x86_simd(
+ nbnxn_atomdata_reduce_reals_x86_simd
#else
- nbnxn_atomdata_reduce_reals(
+ nbnxn_atomdata_reduce_reals
#endif
- nbat->out[0].f,
- nbat->out+1,nbat->nout - 1,
- i0,i1);
+ (nbat->out[0].f,
+ fptr,nfptr,
+ i0,i1);
+ }
+ }
+ }
+ else
+ {
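+ /* No per-block flags available: reduce all force outputs
+ * over this thread's whole cell range.
+ */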
+ c0 = b0*NBNXN_CELLBLOCK_SIZE;
+ c1 = min(b1*NBNXN_CELLBLOCK_SIZE,grid->nc);
+ i0 = (grid->cell0 + c0)*grid->na_c*nbat->fstride;
+ i1 = (grid->cell0 + c1)*grid->na_c*nbat->fstride;
+
+ nfptr = 0;
+ for(out=1; out<nbat->nout; out++)
+ {
+ fptr[nfptr++] = nbat->out[out].f;
+ }
+
+#ifdef NBNXN_SEARCH_SSE
+ nbnxn_atomdata_reduce_reals_x86_simd
+#else
+ nbnxn_atomdata_reduce_reals
+#endif
+ (nbat->out[0].f,
+ fptr,nfptr,
+ i0,i1);
+ }
+ }
}
}