Change
272736bc partially fixed #1388, but broke the more general
case of multiple MPI communication pulses in PME. Change
272736bc
incorrectly changed tx1 and ty1. This change has been reverted.
Change
27189bba fixed the incorrect PME grid reduction with multiple
thread grid overlap in y. But it broke the, much more common, case
where the y-size of the PME grid is not divisible by the domains in y.
This change, incorrectly, changed buf_my.
Now buf_my is set to the correct value, which solves both issues.
Fixes #1578.
Refs #1388 and #1572.
Change-Id: Id2d7d013a3b8cdc04eda1fb026567088a38ec81f
{
/* s2g0 the local interpolation grid start.
* s2g1 the local interpolation grid end.
{
/* s2g0 the local interpolation grid start.
* s2g1 the local interpolation grid end.
- * Because grid overlap communication only goes forward,
- * the grid the slabs for fft's should be rounded down.
+ * Since in calc_pidx we divide particles, and not grid lines,
+ * spatially uniform along dimension x or y, we need to round
+ * s2g0 down and s2g1 up.
*/
ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
*/
ol->s2g0[i] = ( i *ndata + 0 )/nnodes;
ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
int fft_my, fft_mz;
int buf_my = -1;
int nsx, nsy, nsz;
int fft_my, fft_mz;
int buf_my = -1;
int nsx, nsy, nsz;
int offx, offy, offz, x, y, z, i0, i0t;
int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
int offx, offy, offz, x, y, z, i0, i0t;
int sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
for (d = 0; d < DIM; d++)
{
for (d = 0; d < DIM; d++)
{
- ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
- local_fft_ndata[d]);
+ /* Determine up to where our thread needs to copy from the
+ * thread-local charge spreading grid to the rank-local FFT grid.
+ * This is up to our spreading grid end minus order-1 and
+ * not beyond the local FFT grid.
+ */
+ localcopy_end[d] =
+ min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+ local_fft_ndata[d]);
}
offx = pmegrid->offset[XX];
}
offx = pmegrid->offset[XX];
pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
ox += pmegrid_g->offset[XX];
/* Determine the end of our part of the source grid */
pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
ox += pmegrid_g->offset[XX];
/* Determine the end of our part of the source grid */
- tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+ if (!bCommX)
+ {
+ /* Use our thread local source grid and target grid part */
+ tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
+ }
+ else
+ {
+ /* Use our thread local source grid and the spreading range */
+ tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
+ }
for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
{
for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
{
pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
oy += pmegrid_g->offset[YY];
/* Determine the end of our part of the source grid */
pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
oy += pmegrid_g->offset[YY];
/* Determine the end of our part of the source grid */
- ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+ if (!bCommY)
+ {
+ /* Use our thread local source grid and target grid part */
+ ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
+ }
+ else
+ {
+ /* Use our thread local source grid and the spreading range */
+ ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
+ }
for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
{
for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
{
}
pmegrid_g = &pmegrids->grid_th[fz];
oz += pmegrid_g->offset[ZZ];
}
pmegrid_g = &pmegrids->grid_th[fz];
oz += pmegrid_g->offset[ZZ];
- tz1 = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+ tz1 = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
if (sx == 0 && sy == 0 && sz == 0)
{
if (sx == 0 && sy == 0 && sz == 0)
{
if (bCommY)
{
commbuf = commbuf_y;
if (bCommY)
{
commbuf = commbuf_y;
- /* The y-size of the communication buffer is order-1 */
- buf_my = pmegrid->order - 1;
+ /* The y-size of the communication buffer is set by
+ * the overlap of the grid part of our local slab
+ * with the part starting at the next slab.
+ */
+ buf_my =
+ pme->overlap[1].s2g1[pme->nodeid_minor] -
+ pme->overlap[1].s2g0[pme->nodeid_minor+1];
if (bCommX)
{
/* We index commbuf modulo the local grid size */
if (bCommX)
{
/* We index commbuf modulo the local grid size */
sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
recvptr = overlap->recvbuf;
sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
recvptr = overlap->recvbuf;
+ if (debug != NULL)
+ {
+ fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+ local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+ }
+
#ifdef GMX_MPI
MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
send_id, ipulse,
#ifdef GMX_MPI
MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
send_id, ipulse,
- fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+ fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
}
send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
}