Fixed two energy group indexing errors in the nbnxn CPU kernels.
The code has now been generalized and we avoid code duplication.
Change-Id: I524d0d2c86b54a92ae124f2c067ccef3daac1515
if (!simple)
{
/* Energy groups not supported yet for super-sub lists */
+ if (n_energygroups > 1 && fp != NULL)
+ {
+ fprintf(fp,"\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
+ }
nbat->nenergrp = 1;
}
- /* Temporary storage goes is #grp^3*8 real, so limit to 64 */
+ /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
if (nbat->nenergrp > 64)
{
gmx_fatal(FARGS,"With NxN kernels not more than 64 energy groups are supported\n");
j = 0;
for(i=0; i<na; i+=na_c)
{
- /* Store na_c energy groups number into one int */
+ /* Store na_c energy group numbers into one int */
comb = 0;
for(sa=0; sa<na_c; sa++)
{
}
}
+/* Sum the per-list energy-group pair energies of the nlist output
+ * buffers in nbat into the nenergrp x nenergrp matrices Vvdw and Vc.
+ * Diagonal entries (i,i) are accumulated directly. For off-diagonal
+ * pairs, both the (i,j) and (j,i) contributions from the kernel
+ * output are folded into the single entry (i,j) with i < j, so the
+ * reduced result carries each off-diagonal energy only once, in the
+ * upper triangle (see the comment in the inner loop below).
+ */
+void
+reduce_energies_over_lists(const nbnxn_atomdata_t *nbat,
+ int nlist,
+ real *Vvdw,
+ real *Vc)
+{
+ int nb;
+ int i,j,ind,indr;
+
+ for(nb=0; nb<nlist; nb++)
+ {
+ for(i=0; i<nbat->nenergrp; i++)
+ {
+ /* Reduce the diagonal terms */
+ ind = i*nbat->nenergrp + i;
+ Vvdw[ind] += nbat->out[nb].Vvdw[ind];
+ Vc[ind] += nbat->out[nb].Vc[ind];
+
+ /* Reduce the off-diagonal terms */
+ for(j=i+1; j<nbat->nenergrp; j++)
+ {
+ /* The output should contain only one off-diagonal part */
+ ind = i*nbat->nenergrp + j;
+ indr = j*nbat->nenergrp + i;
+ Vvdw[ind] += nbat->out[nb].Vvdw[ind] + nbat->out[nb].Vvdw[indr];
+ Vc[ind] += nbat->out[nb].Vc[ind] + nbat->out[nb].Vc[indr];
+ }
+ }
+ }
+}
void
clear_fshift(real *fshift);
+/* Sums the energy-group pair energies of all nlist output buffers in
+ * nbat into Vvdw and Vc; off-diagonal group-pair contributions end up
+ * in the upper triangle only (entries (i,j) with i < j).
+ */
+void
+reduce_energies_over_lists(const nbnxn_atomdata_t *nbat,
+ int nlist,
+ real *Vvdw,
+ real *Vc);
+
#if 0
{
#endif
if (force_flags & GMX_FORCE_ENERGY)
{
- /* Reduce the energies */
- for(nb=0; nb<nnbl; nb++)
- {
- int i;
-
- for(i=0; i<nbat->out[nb].nV; i++)
- {
- Vvdw[i] += nbat->out[nb].Vvdw[i];
- Vc[i] += nbat->out[nb].Vc[i];
- }
- }
+ reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
}
}
nbnxn_kernel_x86_simd128_tab_twin_comb_lb_noener,
nbnxn_kernel_x86_simd128_tab_twin_comb_none_noener } };
-#endif /* SSE */
-
static void reduce_group_energies(int ng,int ng_2log,
const real *VSvdw,const real *VSc,
{
int ng_p2,i,j,j0,j1,c,s;
-#ifndef GMX_DOUBLE
-#define SIMD_WIDTH 4
-#define SIMD_WIDTH_2 2
-#else
-#define SIMD_WIDTH 2
-#define SIMD_WIDTH_2 1
-#endif
+#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
+#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
ng_p2 = (1<<ng_2log);
- /* The size of the SSE energy group buffer array is:
- * stride^3*SIMD_WIDTH_2*SIMD_WIDTH
+ /* The size of the x86 SIMD energy group buffer array is:
+ * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_2; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
+ for(s=0; s<SIMD_WIDTH_HALF; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
}
}
+#endif /* GMX_X86_SSE2 */
+
void
nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t *nbl_list,
const nbnxn_atomdata_t *nbat,
if (force_flags & GMX_FORCE_ENERGY)
{
- /* Reduce the energies */
- for(nb=0; nb<nnbl; nb++)
- {
- int i;
-
- for(i=0; i<nbat->out[nb].nV; i++)
- {
- Vvdw[i] += nbat->out[nb].Vvdw[i];
- Vc[i] += nbat->out[nb].Vc[i];
- }
- }
+ reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
}
}
#else
nbnxn_kernel_x86_simd256_tab_twin_comb_lb_noener,
nbnxn_kernel_x86_simd256_tab_twin_comb_none_noener } };
-#endif /* SSE */
-
static void reduce_group_energies(int ng,int ng_2log,
const real *VSvdw,const real *VSc,
{
int ng_p2,i,j,j0,j1,c,s;
-#ifndef GMX_DOUBLE
-#define SIMD_WIDTH 4
-#define SIMD_WIDTH_2 2
-#else
-#define SIMD_WIDTH 2
-#define SIMD_WIDTH_2 1
-#endif
+#define SIMD_WIDTH (GMX_X86_SIMD_WIDTH_HERE)
+#define SIMD_WIDTH_HALF (GMX_X86_SIMD_WIDTH_HERE/2)
ng_p2 = (1<<ng_2log);
- /* The size of the SSE energy group buffer array is:
- * stride^3*SIMD_WIDTH_2*SIMD_WIDTH
+ /* The size of the x86 SIMD energy group buffer array is:
+ * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
*/
for(i=0; i<ng; i++)
{
{
for(j0=0; j0<ng; j0++)
{
- c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_2*SIMD_WIDTH;
- for(s=0; s<SIMD_WIDTH_2; s++)
+ c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH;
+ for(s=0; s<SIMD_WIDTH_HALF; s++)
{
Vvdw[i*ng+j0] += VSvdw[c+0];
Vvdw[i*ng+j1] += VSvdw[c+1];
}
}
+#endif /* GMX_X86_AVX_256 */
+
void
nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t *nbl_list,
const nbnxn_atomdata_t *nbat,
if (force_flags & GMX_FORCE_ENERGY)
{
- /* Reduce the energies */
- for(nb=0; nb<nnbl; nb++)
- {
- int i;
-
- for(i=0; i<nbat->out[nb].nV; i++)
- {
- Vvdw[i] += nbat->out[nb].Vvdw[i];
- Vc[i] += nbat->out[nb].Vc[i];
- }
- }
+ reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc);
}
}
#else
int cj,aj,ajx,ajy,ajz;
#ifdef ENERGY_GROUPS
- int egps_j;
- int egp_jj[UNROLLJ>>1];
- int jj;
+ /* Energy group indices for two atoms packed into one int */
+ int egp_jj[UNROLLJ/2];
#endif
#ifdef CHECK_EXCLS
#ifdef CALC_ENERGIES
#ifdef ENERGY_GROUPS
- /* Extract the group pair index per j pair */
+ /* Extract the group pair index per j pair.
+ * Energy groups are stored per i-cluster, so things get
+ * complicated when the i- and j-cluster size don't match.
+ */
+ {
+ int egps_j;
#if UNROLLJ == 2
- egps_j = nbat->energrp[cj>>1];
- egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
+ egps_j = nbat->energrp[cj>>1];
+ egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
#else
- egps_j = nbat->energrp[cj];
- for(jj=0; jj<(UNROLLJ>>1); jj++)
- {
- egp_jj[jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
- }
+ /* We assume UNROLLI <= UNROLLJ */
+ int jdi;
+ for(jdi=0; jdi<UNROLLJ/UNROLLI; jdi++)
+ {
+ int jj;
+ egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
+ for(jj=0; jj<(UNROLLI/2); jj++)
+ {
+ egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
+ }
+ }
#endif
+ }
#endif
#ifdef CALC_COULOMB
{
int ia,egp_ia;
- for(ia=0; ia<4; ia++)
+ for(ia=0; ia<UNROLLI; ia++)
{
egp_ia = (egps_i >> (ia*egps_ishift)) & egps_imask;
vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
* the rapidly increases number of combinations of energy groups.
* We add to a temporary buffer for 1 i-group vs 2 j-groups.
*/
- for(jj=0; jj<(UNROLLJ>>1); jj++)
+ for(jj=0; jj<(UNROLLJ/2); jj++)
{
gmx_mm_pr v_SSE;