#include "pull.h"
+#include "config.h"
+
#include <assert.h>
#include <math.h>
#include <stdio.h>
t_commrec *cr, double t, real lambda,
rvec *x, rvec *f, tensor vir, real *dvdlambda)
{
- real V, dVdl;
+ real V = 0, dVdl;
assert(pull != NULL);
- pull_calc_coms(cr, pull, md, pbc, t, x, NULL);
+ if (pull->comm.bParticipate)
+ {
+ pull_calc_coms(cr, pull, md, pbc, t, x, NULL);
- do_pull_pot(pull, pbc, t, lambda,
- &V, MASTER(cr) ? vir : NULL, &dVdl);
+ do_pull_pot(pull, pbc, t, lambda,
+ &V, MASTER(cr) ? vir : NULL, &dVdl);
- /* Distribute forces over pulled groups */
- apply_forces(pull, md, f);
+ /* Distribute forces over the pull groups */
+ apply_forces(pull, md, f);
- if (MASTER(cr))
- {
- *dvdlambda += dVdl;
+ if (MASTER(cr))
+ {
+ *dvdlambda += dVdl;
+ }
}
return (MASTER(cr) ? V : 0.0);
{
assert(pull != NULL);
- pull_calc_coms(cr, pull, md, pbc, t, x, xp);
+ if (pull->comm.bParticipate)
+ {
+ pull_calc_coms(cr, pull, md, pbc, t, x, xp);
- do_constraint(pull, pbc, xp, v, MASTER(cr), vir, dt, t);
+ do_constraint(pull, pbc, xp, v, MASTER(cr), vir, dt, t);
+ }
}
static void make_local_pull_group(gmx_ga2la_t ga2la,
}
}
-void dd_make_local_pull_groups(gmx_domdec_t *dd, struct pull_t *pull, t_mdatoms *md)
+void dd_make_local_pull_groups(t_commrec *cr, struct pull_t *pull, t_mdatoms *md)
{
- gmx_ga2la_t ga2la;
- int g;
+ gmx_domdec_t *dd;
+ pull_comm_t *comm;
+ gmx_ga2la_t ga2la;
+ gmx_bool bMustParticipate;
+ int g;
+
+ dd = cr->dd;
+
+ comm = &pull->comm;
if (dd)
{
ga2la = NULL;
}
+ /* We always make the master node participate, such that it can do i/o
+ * and to simplify MC type extensions people might have.
+ */
+ bMustParticipate = (comm->bParticipateAll || dd == NULL || DDMASTER(dd));
+
for (g = 0; g < pull->ngroup; g++)
{
+ int a;
+
make_local_pull_group(ga2la, &pull->group[g],
0, md->homenr);
+
+ /* We should participate if we have pull or pbc atoms */
+ if (!bMustParticipate &&
+ (pull->group[g].nat_loc > 0 ||
+ (pull->group[g].params.pbcatom >= 0 &&
+ ga2la_get_home(dd->ga2la, pull->group[g].params.pbcatom, &a))))
+ {
+ bMustParticipate = TRUE;
+ }
+ }
+
+ if (!comm->bParticipateAll)
+ {
+ /* Keep currently not required ranks in the communicator
+ * if they needed to participate up to 20 decompositions ago.
+ * This avoids frequent rebuilds due to atoms jumping back and forth.
+ */
+ const gmx_int64_t history_count = 20;
+ gmx_bool bWillParticipate;
+ int count[2];
+
+ /* Increase the decomposition counter for the current call */
+ comm->setup_count++;
+
+ if (bMustParticipate)
+ {
+ comm->must_count = comm->setup_count;
+ }
+
+ bWillParticipate =
+ bMustParticipate ||
+ (comm->bParticipate &&
+ comm->must_count >= comm->setup_count - history_count);
+
+ if (debug && dd != NULL)
+ {
+ fprintf(debug, "Our DD rank (%3d) pull #atoms>0 or master: %d, will be part %d\n",
+ dd->rank, bMustParticipate, bWillParticipate);
+ }
+
+ if (bWillParticipate)
+ {
+ /* Count the number of ranks that we want to have participating */
+ count[0] = 1;
+ /* Count the number of ranks that need to be added */
+ count[1] = comm->bParticipate ? 0 : 1;
+ }
+ else
+ {
+ count[0] = 0;
+ count[1] = 0;
+ }
+
+ /* The cost of this global operation will be less that the cost
+ * of the extra MPI_Comm_split calls that we can avoid.
+ */
+ gmx_sumi(2, count, cr);
+
+ /* If we are missing ranks or if we have 20% more ranks than needed
+ * we make a new sub-communicator.
+ */
+ if (count[1] > 0 || 6*count[0] < 5*comm->nparticipate)
+ {
+ if (debug)
+ {
+ fprintf(debug, "Creating new pull subcommunicator of size %d\n",
+ count[0]);
+ }
+
+#ifdef GMX_MPI
+ if (comm->mpi_comm_com != MPI_COMM_NULL)
+ {
+ MPI_Comm_free(&comm->mpi_comm_com);
+ }
+
+ /* This might be an extremely expensive operation, so we try
+ * to avoid this splitting as much as possible.
+ */
+ assert(dd != NULL);
+ MPI_Comm_split(dd->mpi_comm_all, bWillParticipate ? 0 : 1, dd->rank,
+ &comm->mpi_comm_com);
+#endif
+
+ comm->bParticipate = bWillParticipate;
+ comm->nparticipate = count[0];
+ }
}
/* Since the PBC of atoms might have changed, we need to update the PBC */
gmx_bool bOutFile, unsigned long Flags)
{
struct pull_t *pull;
+ pull_comm_t *comm;
int g, c, m;
snew(pull, 1);
}
}
- pull->rbuf = NULL;
- pull->dbuf = NULL;
- pull->dbuf_cyl = NULL;
pull->bRefAt = FALSE;
pull->cosdim = -1;
for (g = 0; g < pull->ngroup; g++)
}
}
+ comm = &pull->comm;
+
+#ifdef GMX_MPI
+ /* Use a sub-communicator when we have more than 32 ranks */
+ comm->bParticipateAll = (cr == NULL || !DOMAINDECOMP(cr) ||
+ cr->dd->nnodes <= 32 ||
+ getenv("GMX_PULL_PARTICIPATE_ALL") != NULL);
+ /* This sub-commicator is not used with comm->bParticipateAll,
+ * so we can always initialize it to NULL.
+ */
+ comm->mpi_comm_com = MPI_COMM_NULL;
+ comm->nparticipate = 0;
+#else
+ /* No MPI: 1 rank: all ranks pull */
+ comm->bParticipateAll = TRUE;
+#endif
+ comm->bParticipate = comm->bParticipateAll;
+ comm->setup_count = 0;
+ comm->must_count = 0;
+
+ if (!comm->bParticipateAll && fplog != NULL)
+ {
+ fprintf(fplog, "Will use a sub-communicator for pull communication\n");
+ }
+
+ comm->rbuf = NULL;
+ comm->dbuf = NULL;
+ comm->dbuf_cyl = NULL;
+
/* We still need to initialize the PBC reference coordinates */
pull->bSetPBCatoms = TRUE;
sfree(pull->dyna);
sfree(pull->coord);
- sfree(pull->rbuf);
- sfree(pull->dbuf);
- sfree(pull->dbuf_cyl);
+#ifdef GMX_MPI
+ if (pull->comm.mpi_comm_com != MPI_COMM_NULL)
+ {
+ MPI_Comm_free(&pull->comm.mpi_comm_com);
+ }
+#endif
+ sfree(pull->comm.rbuf);
+ sfree(pull->comm.dbuf);
+ sfree(pull->comm.dbuf_cyl);
sfree(pull);
}
*/
#include "gmxpre.h"
+#include "config.h"
+
#include <assert.h>
#include <stdlib.h>
#include "gromacs/pulling/pull_internal.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/futil.h"
+#include "gromacs/utility/real.h"
#include "gromacs/utility/smalloc.h"
+static void pull_reduce_real(t_commrec *cr,
+ pull_comm_t *comm,
+ int n,
+ real *data)
+{
+ if (cr != NULL && PAR(cr))
+ {
+ if (comm->bParticipateAll)
+ {
+ /* Sum the contributions over all DD ranks */
+ gmx_sum(n, data, cr);
+ }
+ else
+ {
+#ifdef GMX_MPI
+#ifdef MPI_IN_PLACE_EXISTS
+ MPI_Allreduce(MPI_IN_PLACE, data, n, GMX_MPI_REAL, MPI_SUM,
+ comm->mpi_comm_com);
+#else
+ real *buf;
+
+ snew(buf, n);
+
+ MPI_Allreduce(data, buf, n, GMX_MPI_REAL, MPI_SUM,
+ comm->mpi_comm_com);
+
+ /* Copy the result from the buffer to the input/output data */
+ for (i = 0; i < nr; i++)
+ {
+ data[i] = buf[i];
+ }
+ sfree(buf);
+#endif
+#else
+ gmx_incons("comm->bParticipateAll=FALSE without GMX_MPI");
+#endif
+ }
+ }
+}
+
+static void pull_reduce_double(t_commrec *cr,
+ pull_comm_t *comm,
+ int n,
+ double *data)
+{
+ if (cr != NULL && PAR(cr))
+ {
+ if (comm->bParticipateAll)
+ {
+ /* Sum the contributions over all DD ranks */
+ gmx_sumd(n, data, cr);
+ }
+ else
+ {
+#ifdef GMX_MPI
+#ifdef MPI_IN_PLACE_EXISTS
+ MPI_Allreduce(MPI_IN_PLACE, data, n, MPI_DOUBLE, MPI_SUM,
+ comm->mpi_comm_com);
+#else
+ double *buf;
+
+ snew(buf, n);
+
+ MPI_Allreduce(data, buf, n, MPI_DOUBLE, MPI_SUM,
+ comm->mpi_comm_com);
+
+ /* Copy the result from the buffer to the input/output data */
+ for (i = 0; i < nr; i++)
+ {
+ data[i] = buf[i];
+ }
+ sfree(buf);
+#endif
+#else
+ gmx_incons("comm->bParticipateAll=FALSE without GMX_MPI");
+#endif
+ }
+ }
+}
+
static void pull_set_pbcatom(t_commrec *cr, pull_group_work_t *pgrp,
rvec *x,
rvec x_pbc)
if (cr && PAR(cr) && n > 0)
{
- /* Sum over the nodes to get x_pbc from the home node of pbcatom */
- gmx_sum(pull->ngroup*DIM, x_pbc[0], cr);
+ /* Sum over participating ranks to get x_pbc from the home ranks.
+ * This can be very expensive at high parallelization, so we only
+ * do this after each DD repartitioning.
+ */
+ pull_reduce_real(cr, &pull->comm, pull->ngroup*DIM, x_pbc[0]);
}
}
int c, i, ii, m, start, end;
rvec g_x, dx, dir;
double inv_cyl_r2;
+ pull_comm_t *comm;
gmx_ga2la_t ga2la = NULL;
- if (pull->dbuf_cyl == NULL)
+ comm = &pull->comm;
+
+ if (comm->dbuf_cyl == NULL)
{
- snew(pull->dbuf_cyl, pull->ncoord*stride);
+ snew(comm->dbuf_cyl, pull->ncoord*stride);
}
if (cr && DOMAINDECOMP(cr))
}
}
}
- pull->dbuf_cyl[c*stride+0] = wmass;
- pull->dbuf_cyl[c*stride+1] = wwmass;
- pull->dbuf_cyl[c*stride+2] = sum_a;
- pull->dbuf_cyl[c*stride+3] = radf_fac0[XX];
- pull->dbuf_cyl[c*stride+4] = radf_fac0[YY];
- pull->dbuf_cyl[c*stride+5] = radf_fac0[ZZ];
- pull->dbuf_cyl[c*stride+6] = radf_fac1[XX];
- pull->dbuf_cyl[c*stride+7] = radf_fac1[YY];
- pull->dbuf_cyl[c*stride+8] = radf_fac1[ZZ];
+ comm->dbuf_cyl[c*stride+0] = wmass;
+ comm->dbuf_cyl[c*stride+1] = wwmass;
+ comm->dbuf_cyl[c*stride+2] = sum_a;
+ comm->dbuf_cyl[c*stride+3] = radf_fac0[XX];
+ comm->dbuf_cyl[c*stride+4] = radf_fac0[YY];
+ comm->dbuf_cyl[c*stride+5] = radf_fac0[ZZ];
+ comm->dbuf_cyl[c*stride+6] = radf_fac1[XX];
+ comm->dbuf_cyl[c*stride+7] = radf_fac1[YY];
+ comm->dbuf_cyl[c*stride+8] = radf_fac1[ZZ];
}
if (cr != NULL && PAR(cr))
{
/* Sum the contributions over the ranks */
- gmx_sumd(pull->ncoord*stride, pull->dbuf_cyl, cr);
+ pull_reduce_double(cr, comm, pull->ncoord*stride, comm->dbuf_cyl);
}
for (c = 0; c < pull->ncoord; c++)
pdyna = &pull->dyna[c];
pgrp = &pull->group[pcrd->params.group[1]];
- wmass = pull->dbuf_cyl[c*stride+0];
- wwmass = pull->dbuf_cyl[c*stride+1];
+ wmass = comm->dbuf_cyl[c*stride+0];
+ wwmass = comm->dbuf_cyl[c*stride+1];
pdyna->mwscale = 1.0/wmass;
/* Cylinder pulling can't be used with constraints, but we set
* wscale and invtm anyhow, in case someone would like to use them.
for (m = 0; m < DIM; m++)
{
g_x[m] = pgrp->x[m] - pcrd->vec[m]*pcrd->value_ref;
- dist = -pcrd->vec[m]*pull->dbuf_cyl[c*stride+2]*pdyna->mwscale;
+ dist = -pcrd->vec[m]*comm->dbuf_cyl[c*stride+2]*pdyna->mwscale;
pdyna->x[m] = g_x[m] - dist;
pcrd->cyl_dev += dist;
}
*/
for (m = 0; m < DIM; m++)
{
- pcrd->ffrad[m] = (pull->dbuf_cyl[c*stride+6+m] +
- pull->dbuf_cyl[c*stride+3+m]*pcrd->cyl_dev)/wmass;
+ pcrd->ffrad[m] = (comm->dbuf_cyl[c*stride+6+m] +
+ comm->dbuf_cyl[c*stride+3+m]*pcrd->cyl_dev)/wmass;
}
if (debug)
struct pull_t *pull, t_mdatoms *md, t_pbc *pbc, double t,
rvec x[], rvec *xp)
{
- int g;
- real twopi_box = 0;
+ int g;
+ real twopi_box = 0;
+ pull_comm_t *comm;
+
+ comm = &pull->comm;
- if (pull->rbuf == NULL)
+ if (comm->rbuf == NULL)
{
- snew(pull->rbuf, pull->ngroup);
+ snew(comm->rbuf, pull->ngroup);
}
- if (pull->dbuf == NULL)
+ if (comm->dbuf == NULL)
{
- snew(pull->dbuf, 3*pull->ngroup);
+ snew(comm->dbuf, 3*pull->ngroup);
}
if (pull->bRefAt && pull->bSetPBCatoms)
{
- pull_set_pbcatoms(cr, pull, x, pull->rbuf);
+ pull_set_pbcatoms(cr, pull, x, comm->rbuf);
if (cr != NULL && DOMAINDECOMP(cr))
{
if (pgrp->epgrppbc == epgrppbcREFAT)
{
/* Set the pbc atom */
- copy_rvec(pull->rbuf[g], x_pbc);
+ copy_rvec(comm->rbuf[g], x_pbc);
}
for (i = 0; i < pgrp->nat_loc; i++)
}
/* Copy local sums to a buffer for global summing */
- copy_dvec(com, pull->dbuf[g*3]);
- copy_dvec(comp, pull->dbuf[g*3+1]);
- pull->dbuf[g*3+2][0] = wmass;
- pull->dbuf[g*3+2][1] = wwmass;
- pull->dbuf[g*3+2][2] = 0;
+ copy_dvec(com, comm->dbuf[g*3]);
+ copy_dvec(comp, comm->dbuf[g*3 + 1]);
+ comm->dbuf[g*3 + 2][0] = wmass;
+ comm->dbuf[g*3 + 2][1] = wwmass;
+ comm->dbuf[g*3 + 2][2] = 0;
}
else
{
}
/* Copy local sums to a buffer for global summing */
- pull->dbuf[g*3 ][0] = cm;
- pull->dbuf[g*3 ][1] = sm;
- pull->dbuf[g*3 ][2] = 0;
- pull->dbuf[g*3+1][0] = ccm;
- pull->dbuf[g*3+1][1] = csm;
- pull->dbuf[g*3+1][2] = ssm;
- pull->dbuf[g*3+2][0] = cmp;
- pull->dbuf[g*3+2][1] = smp;
- pull->dbuf[g*3+2][2] = 0;
+ comm->dbuf[g*3 ][0] = cm;
+ comm->dbuf[g*3 ][1] = sm;
+ comm->dbuf[g*3 ][2] = 0;
+ comm->dbuf[g*3+1][0] = ccm;
+ comm->dbuf[g*3+1][1] = csm;
+ comm->dbuf[g*3+1][2] = ssm;
+ comm->dbuf[g*3+2][0] = cmp;
+ comm->dbuf[g*3+2][1] = smp;
+ comm->dbuf[g*3+2][2] = 0;
}
}
}
- if (cr && PAR(cr))
- {
- /* Sum the contributions over the nodes */
- gmx_sumd(pull->ngroup*3*DIM, pull->dbuf[0], cr);
- }
+ pull_reduce_double(cr, comm, pull->ngroup*3*DIM, comm->dbuf[0]);
for (g = 0; g < pull->ngroup; g++)
{
int m;
/* Determine the inverse mass */
- wmass = pull->dbuf[g*3+2][0];
- wwmass = pull->dbuf[g*3+2][1];
+ wmass = comm->dbuf[g*3+2][0];
+ wwmass = comm->dbuf[g*3+2][1];
pgrp->mwscale = 1.0/wmass;
/* invtm==0 signals a frozen group, so then we should keep it zero */
if (pgrp->invtm != 0)
/* Divide by the total mass */
for (m = 0; m < DIM; m++)
{
- pgrp->x[m] = pull->dbuf[g*3 ][m]*pgrp->mwscale;
+ pgrp->x[m] = comm->dbuf[g*3 ][m]*pgrp->mwscale;
if (xp)
{
- pgrp->xp[m] = pull->dbuf[g*3+1][m]*pgrp->mwscale;
+ pgrp->xp[m] = comm->dbuf[g*3+1][m]*pgrp->mwscale;
}
if (pgrp->epgrppbc == epgrppbcREFAT)
{
- pgrp->x[m] += pull->rbuf[g][m];
+ pgrp->x[m] += comm->rbuf[g][m];
if (xp)
{
- pgrp->xp[m] += pull->rbuf[g][m];
+ pgrp->xp[m] += comm->rbuf[g][m];
}
}
}
int i, ii;
/* Determine the optimal location of the cosine weight */
- csw = pull->dbuf[g*3][0];
- snw = pull->dbuf[g*3][1];
+ csw = comm->dbuf[g*3][0];
+ snw = comm->dbuf[g*3][1];
pgrp->x[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
/* Set the weights for the local atoms */
wmass = sqrt(csw*csw + snw*snw);
- wwmass = (pull->dbuf[g*3+1][0]*csw*csw +
- pull->dbuf[g*3+1][1]*csw*snw +
- pull->dbuf[g*3+1][2]*snw*snw)/(wmass*wmass);
+ wwmass = (comm->dbuf[g*3+1][0]*csw*csw +
+ comm->dbuf[g*3+1][1]*csw*snw +
+ comm->dbuf[g*3+1][2]*snw*snw)/(wmass*wmass);
pgrp->mwscale = 1.0/wmass;
pgrp->wscale = wmass/wwmass;
}
if (xp)
{
- csw = pull->dbuf[g*3+2][0];
- snw = pull->dbuf[g*3+2][1];
+ csw = comm->dbuf[g*3+2][0];
+ snw = comm->dbuf[g*3+2][1];
pgrp->xp[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
}
}