/* Sets up fast global communication for clusters with multi-core nodes */
GMX_LIBGMX_EXPORT
-void gmx_init_intra_counters(t_commrec *cr);
-/* Initializes intra-node process counts and ID. */
+void gmx_init_intranode_counters(t_commrec *cr);
+/* Initializes intra-physical-node MPI process/thread counts and ID. */
gmx_bool gmx_mpi_initialized(void);
/* return TRUE when MPI_Init has been called.
MPI_Comm mpi_comm_mysim;
MPI_Comm mpi_comm_mygroup;
- /* intra-node stuff */
- int nodeid_intra; /* ID over all intra nodes */
- int nodeid_group_intra; /* ID within my group (separate 0-n IDs for PP/PME-only nodes) */
- int nnodes_intra; /* total number of intra nodes */
- int nnodes_pp_intra; /* total number of PP intra nodes */
+ /* MPI ranks within a physical node for hardware access */
+ int nrank_intranode; /* nr of ranks on this physical node */
+ int rank_intranode; /* our rank on this physical node */
+ int nrank_pp_intranode; /* as nrank_intranode, for particle-particle only */
+ int rank_pp_intranode; /* as rank_intranode, for particle-particle only */
gmx_nodecomm_t nc;
* => keep on the GPU support, otherwise turn off (or bail if forced)
* */
/* number of PP processes per node */
- npppn = cr->nnodes_pp_intra;
+ npppn = cr->nrank_pp_intranode;
pernode[0] = '\0';
th_or_proc_plural[0] = '\0';
ShortProgram(), npppn, npppn > 1 ? "s" : "",
bMaxMpiThreadsSet ? "\n Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
- if (cr->nodeid_intra == 0)
+ if (cr->rank_pp_intranode == 0)
{
limit_num_gpus_used(hwinfo, npppn);
ngpu = hwinfo->gpu_info.ncuda_dev_use;
th_or_proc, th_or_proc_plural, pernode, gpu_plural,
th_or_proc, npppn, gpu_plural, pernode);
- if (bMPI || (btMPI && cr->nodeid_intra == 0))
+ if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
{
limit_num_gpus_used(hwinfo, npppn);
ngpu = hwinfo->gpu_info.ncuda_dev_use;
* level, since the hardware setup and MPI process count
* might be differ over physical nodes.
*/
- if (cr->nodeid_intra == 0)
+ if (cr->rank_pp_intranode == 0)
{
gmx_fatal(FARGS,
"Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
}
}
- if (hwinfo->gpu_info.bUserSet && (cr->nodeid_intra == 0))
+ if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
{
int i, j, same_count;
gmx_bool bSomeSame, bAllDifferent;
bOMP = FALSE;
#endif /* GMX_OPENMP */
- /* number of processes per node */
- nppn = cr->nnodes_intra;
+ /* number of MPI processes/threads per physical node */
+ nppn = cr->nrank_intranode;
bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
(!(cr->duty & DUTY_PP) && (cr->duty & DUTY_PME));
/* detect and warn about oversubscription
* TODO: enable this for separate PME nodes as well! */
- if (!bSepPME && cr->nodeid_intra == 0)
+ if (!bSepPME && cr->rank_pp_intranode == 0)
{
char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN];
#endif
}
-void gmx_init_intra_counters(t_commrec *cr)
+void gmx_init_intranode_counters(t_commrec *cr)
{
- /* counters for PP+PME and PP-only processes on my node */
- int nnodes, nnodes_pp, id_mynode=-1, id_mynode_group=-1, nproc_mynode, nproc_mynode_pp;
+ /* counters for PP+PME and PP-only processes on my physical node */
+ int nrank_intranode, rank_intranode;
+ int nrank_pp_intranode, rank_pp_intranode;
+ /* thread-MPI is not initialized when not running in parallel */
#if defined GMX_MPI && !defined GMX_THREAD_MPI
+ int nrank_world, rank_world;
int i, mynum, *num, *num_s, *num_pp, *num_pp_s;
-#endif
- nnodes = cr->nnodes;
- nnodes_pp = nnodes - cr->npmenodes;
+ MPI_Comm_size(MPI_COMM_WORLD,&nrank_world);
+ MPI_Comm_rank(MPI_COMM_WORLD,&rank_world);
-#if defined GMX_MPI && !defined GMX_THREAD_MPI
- /* We have MPI and can expect to have different compute nodes */
+ /* Get the node number from the hostname to identify the nodes */
mynum = gmx_hostname_num();
/* We can't rely on MPI_IN_PLACE, so we need send and receive buffers */
- snew(num, nnodes);
- snew(num_s, nnodes);
- snew(num_pp, nnodes_pp);
- snew(num_pp_s, nnodes_pp);
-
- num_s[cr->sim_nodeid] = mynum;
- if (cr->duty & DUTY_PP)
- {
- num_pp_s[cr->nodeid] = mynum;
- }
-
- MPI_Allreduce(num_s, num, nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
- MPI_Allreduce(num_pp_s, num_pp, nnodes_pp, MPI_INT, MPI_SUM, cr->mpi_comm_mygroup);
-
- id_mynode = 0;
- id_mynode_group = 0;
- nproc_mynode = 0;
- nproc_mynode_pp = 0;
- for(i=0; i<nnodes; i++)
+ snew(num, nrank_world);
+ snew(num_s, nrank_world);
+ snew(num_pp, nrank_world);
+ snew(num_pp_s, nrank_world);
+
+ num_s[rank_world] = mynum;
+ num_pp_s[rank_world] = (cr->duty & DUTY_PP) ? mynum : -1;
+
+ MPI_Allreduce(num_s, num, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+ MPI_Allreduce(num_pp_s, num_pp, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+ nrank_intranode = 0;
+ rank_intranode = 0;
+ nrank_pp_intranode = 0;
+ rank_pp_intranode = 0;
+ for(i=0; i<nrank_world; i++)
{
if (num[i] == mynum)
{
- nproc_mynode++;
- if (i < cr->sim_nodeid)
+ nrank_intranode++;
+ if (i < rank_world)
{
- id_mynode++;
- }
- if (i < cr->nodeid)
- {
- id_mynode_group++;
+ rank_intranode++;
}
}
- }
- for(i=0; i<nnodes_pp; i++)
- {
- if (num_pp[i] == mynum)
+ if ((cr->duty & DUTY_PP) && num_pp[i] == mynum)
{
- nproc_mynode_pp++;
+ nrank_pp_intranode++;
+ if (i < rank_world)
+ {
+ rank_pp_intranode++;
+ }
}
}
sfree(num);
sfree(num_pp);
sfree(num_pp_s);
#else
- /* Serial or thread-MPI code, we are running within a node */
- id_mynode = cr->sim_nodeid;
- id_mynode_group = cr->nodeid;
- nproc_mynode = cr->nnodes;
- nproc_mynode_pp = cr->nnodes - cr->npmenodes;
+ /* Serial or thread-MPI code: we run within a single physical node */
+ nrank_intranode = cr->nnodes;
+ rank_intranode = cr->sim_nodeid;
+ nrank_pp_intranode = cr->nnodes - cr->npmenodes;
+ rank_pp_intranode = cr->nodeid;
#endif
if (debug)
{
sprintf(sbuf, "%s", cr->duty & DUTY_PP ? "PP" : "PME");
}
- fprintf(debug, "On %3s node %d: nodeid_intra=%d, nodeid_group_intra=%d, "
- "nnodes_intra=%d, nnodes_pp_intra=%d\n", sbuf, cr->sim_nodeid,
- id_mynode, id_mynode_group, nproc_mynode, nproc_mynode_pp);
+ fprintf(debug, "On %3s node %d: nrank_intranode=%d, rank_intranode=%d, "
+ "nrank_pp_intranode=%d, rank_pp_intranode=%d\n",
+ sbuf, cr->sim_nodeid,
+ nrank_intranode, rank_intranode,
+ nrank_pp_intranode, rank_pp_intranode);
}
- cr->nodeid_intra = id_mynode;
- cr->nodeid_group_intra = id_mynode_group;
- cr->nnodes_intra = nproc_mynode;
- cr->nnodes_pp_intra = nproc_mynode_pp;
+ cr->nrank_intranode = nrank_intranode;
+ cr->rank_intranode = rank_intranode;
+ cr->nrank_pp_intranode = nrank_pp_intranode;
+ cr->rank_pp_intranode = rank_pp_intranode;
}
/* We need to determine a scan of the thread counts in this
* compute node.
*/
- int process_index;
MPI_Comm comm_intra;
- process_index = cr->nodeid_intra;
- if (MULTISIM(cr))
- {
- /* To simplify the code, we shift process indices by nnodes.
- * There might be far less processes, but that doesn't matter.
- */
- process_index += cr->ms->sim*cr->nnodes;
- }
- MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),process_index,
+ MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
&comm_intra);
MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
/* MPI_Scan is inclusive, but here we need exclusive */
if (debug)
{
- fprintf(debug, "On node %d, thread %d the affinity setting returned %d\n",
- cr->nodeid, gmx_omp_get_thread_num(), setaffinity_ret);
+ fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
+ cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
}
}
gmx_setup_nodecomm(fplog,cr);
}
- /* Initialize per-node process ID and counters. */
- gmx_init_intra_counters(cr);
+ /* Initialize per-physical-node MPI process/thread ID and counters. */
+ gmx_init_intranode_counters(cr);
#ifdef GMX_MPI
md_print_info(cr,fplog,"Using %d MPI %s\n",
{
/* Each PP node will use the intra-node id-th device from the
* list of detected/selected GPUs. */
- if (!init_gpu(cr->nodeid_group_intra, gpu_err_str, &hwinfo->gpu_info))
+ if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
{
/* At this point the init should never fail as we made sure that
* we have all the GPUs we need. If it still does, we'll bail. */
gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
cr->nodeid,
- get_gpu_device_id(&hwinfo->gpu_info, cr->nodeid_group_intra),
+ get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
gpu_err_str);
}
}
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
nbnxn_cuda_init(fp, &nbv->cu_nbv,
- &fr->hwinfo->gpu_info, cr->nodeid_group_intra,
+ &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
(nbv->ngrp > 1) && !bHybridGPURun);
if ((env = getenv("GMX_NB_MIN_CI")) != NULL)