fixed issue with -multi and GPUs
author     Berk Hess <hess@kth.se>
Thu, 29 Nov 2012 21:44:15 +0000 (22:44 +0100)
committer  Berk Hess <hess@kth.se>
Fri, 30 Nov 2012 09:09:57 +0000 (10:09 +0100)
Several GPU-related parts of the code did not take the multi-simulation
(-multi) functionality and separate PME nodes into account.
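
To illustrate the problem, here is a minimal standalone sketch (hypothetical
rank counts and names, not part of this change) of how a per-simulation
intra-node ID can collide on a shared physical node under -multi, while a
rank counted over the whole physical node stays unique:

    #include <stdio.h>

    /* Illustration only (not GROMACS code): two simulations of a -multi run,
     * two PP ranks each, all four ranks placed on one physical node. */
    int main(void)
    {
        int nsim = 2, nrank_per_sim = 2;
        int world_rank;

        for (world_rank = 0; world_rank < nsim*nrank_per_sim; world_rank++)
        {
            int sim              = world_rank / nrank_per_sim;
            /* old: ID restarts at 0 within each simulation's PP group */
            int old_id_group     = world_rank % nrank_per_sim;
            /* new: rank counted over all PP ranks on the physical node */
            int new_pp_intranode = world_rank;

            printf("world rank %d (sim %d): old GPU index %d, new GPU index %d\n",
                   world_rank, sim, old_id_group, new_pp_intranode);
        }
        return 0;
    }

With the old per-simulation ID, ranks 0 and 2 would both initialize GPU 0;
the per-physical-node PP rank gives every rank its own device.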

Change-Id: I4f6471dfda3d14ed7ca33205c56849a27564c742

include/network.h
include/types/commrec.h
src/gmxlib/gmx_detect_hardware.c
src/gmxlib/gmx_omp_nthreads.c
src/gmxlib/network.c
src/kernel/runner.c
src/mdlib/forcerec.c

include/network.h
index f44a3c700f814bcd55c6748122e85cb3b27bf8d8..35ce9bc5286506d0940d3400ddb8b854dd82e86f 100644 (file)
@@ -76,8 +76,8 @@ void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr);
 /* Sets up fast global communication for clusters with multi-core nodes */
 
 GMX_LIBGMX_EXPORT
-void gmx_init_intra_counters(t_commrec *cr);
-/* Initializes intra-node process counts and ID. */
+void gmx_init_intranode_counters(t_commrec *cr);
+/* Initializes intra-physical-node MPI process/thread counts and ID. */
 
 gmx_bool gmx_mpi_initialized(void);
 /* return TRUE when MPI_Init has been called.
include/types/commrec.h
index 4d8a93efa91e9ed3a36d41ca7d8e5d39b5e9bb18..6a1810d7b1c0f73f729b9e66390d3575ff239460 100644 (file)
@@ -274,11 +274,11 @@ typedef struct {
   MPI_Comm mpi_comm_mysim;
   MPI_Comm mpi_comm_mygroup;
 
-  /* intra-node stuff */
-  int nodeid_intra;         /* ID over all intra nodes */ 
-  int nodeid_group_intra;   /* ID within my group (separate 0-n IDs for PP/PME-only nodes) */
-  int nnodes_intra;         /* total number of intra nodes */
-  int nnodes_pp_intra;      /* total number of PP intra nodes */
+  /* MPI ranks within a physical node for hardware access */
+  int nrank_intranode;    /* nr of ranks on this physical node */
+  int rank_intranode;     /* our rank on this physical node */
+  int nrank_pp_intranode; /* as nrank_intranode, for particle-particle only */
+  int rank_pp_intranode;  /* as rank_intranode, for particle-particle only */
 
   gmx_nodecomm_t nc;
   
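A minimal sketch of how these counters are meant to be consumed, assuming one
GPU per PP rank on a node (the struct and helper below are hypothetical
stand-ins, not GROMACS code):

    #include <stdio.h>

    /* Hypothetical stand-in for the fields above: each PP rank on a physical
     * node picks the GPU whose index equals its PP rank within that node. */
    typedef struct {
        int nrank_pp_intranode; /* nr of PP ranks on this physical node */
        int rank_pp_intranode;  /* our PP rank on this physical node */
    } pp_node_counters_t;

    static int pick_gpu(const pp_node_counters_t *c, int ngpu_on_node)
    {
        /* -1 signals that there are not enough GPUs for all PP ranks */
        return (c->rank_pp_intranode < ngpu_on_node) ? c->rank_pp_intranode : -1;
    }

    int main(void)
    {
        pp_node_counters_t c = { 2, 1 }; /* second of two PP ranks on this node */
        printf("GPU index: %d\n", pick_gpu(&c, 2)); /* prints 1 */
        return 0;
    }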
src/gmxlib/gmx_detect_hardware.c
index c9f5fc97aacfdfc6c0d5fb95ce0b3c8e4fe635fb..21e0c649b92a5c6e8c80c0580e68b7c3aea45c1f 100644 (file)
@@ -239,7 +239,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
      * => keep on the GPU support, otherwise turn off (or bail if forced)
      * */
     /* number of PP processes per node */
-    npppn = cr->nnodes_pp_intra;
+    npppn = cr->nrank_pp_intranode;
 
     pernode[0] = '\0';
     th_or_proc_plural[0] = '\0';
@@ -301,7 +301,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                                   ShortProgram(), npppn, npppn > 1 ? "s" : "",
                                   bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
 
-                    if (cr->nodeid_intra == 0)
+                    if (cr->rank_pp_intranode == 0)
                     {
                         limit_num_gpus_used(hwinfo, npppn);
                         ngpu = hwinfo->gpu_info.ncuda_dev_use;
@@ -333,7 +333,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                                   th_or_proc, th_or_proc_plural, pernode, gpu_plural,
                                   th_or_proc, npppn, gpu_plural, pernode);
 
-                    if (bMPI || (btMPI && cr->nodeid_intra == 0))
+                    if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
                     {
                         limit_num_gpus_used(hwinfo, npppn);
                         ngpu = hwinfo->gpu_info.ncuda_dev_use;
@@ -347,7 +347,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                      * level, since the hardware setup and MPI process count
                      * might be differ over physical nodes.
                      */
-                    if (cr->nodeid_intra == 0)
+                    if (cr->rank_pp_intranode == 0)
                     {
                         gmx_fatal(FARGS,
                                   "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
@@ -366,7 +366,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
             }
         }
 
-        if (hwinfo->gpu_info.bUserSet && (cr->nodeid_intra == 0))
+        if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
         {
             int i, j, same_count;
             gmx_bool bSomeSame, bAllDifferent;
src/gmxlib/gmx_omp_nthreads.c
index 8915ed9a9031638ede8e896e5c9b1d4cd4850161..4cbdfca374fe80dd876bc673c1a23ab1fd021e71 100644 (file)
@@ -236,8 +236,8 @@ void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
     bOMP = FALSE;
 #endif /* GMX_OPENMP */
 
-    /* number of processes per node */
-    nppn = cr->nnodes_intra;
+    /* number of MPI processes/threads per physical node */
+    nppn = cr->nrank_intranode;
 
     bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
               (!(cr->duty & DUTY_PP) &&  (cr->duty & DUTY_PME));
@@ -407,7 +407,7 @@ void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
 
     /* detect and warn about oversubscription
      * TODO: enable this for separate PME nodes as well! */
-    if (!bSepPME && cr->nodeid_intra == 0)
+    if (!bSepPME && cr->rank_pp_intranode == 0)
     {
         char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN];
 
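A rough sketch of the per-node gating this change enables (assumed, simplified
logic rather than the actual implementation): only the first PP rank on each
physical node emits the oversubscription warning, comparing ranks-per-node
times threads-per-rank against the node's hardware thread count.

    #include <stdio.h>

    /* Simplified, assumed logic (not the GROMACS implementation): warn once
     * per physical node when the launch configuration oversubscribes it. */
    static void warn_if_oversubscribed(int rank_pp_intranode, int nrank_intranode,
                                       int nth_per_rank, int nhwthread_on_node)
    {
        if (rank_pp_intranode == 0 &&
            nrank_intranode*nth_per_rank > nhwthread_on_node)
        {
            fprintf(stderr,
                    "WARNING: %d ranks x %d threads oversubscribe the %d hardware threads on this node\n",
                    nrank_intranode, nth_per_rank, nhwthread_on_node);
        }
    }

    int main(void)
    {
        warn_if_oversubscribed(0, 4, 8, 16); /* 32 threads on 16 hw threads -> warns */
        return 0;
    }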
src/gmxlib/network.c
index bf33760ebf91d009b2d8052fdf099312fb8d5a18..71ceb9862264dd888555a2e62b6cd6c7b885f0b1 100644 (file)
@@ -388,60 +388,55 @@ void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr)
 #endif
 }
 
-void gmx_init_intra_counters(t_commrec *cr)
+void gmx_init_intranode_counters(t_commrec *cr)
 {
-    /* counters for PP+PME and PP-only processes on my node */
-    int nnodes, nnodes_pp, id_mynode=-1, id_mynode_group=-1, nproc_mynode, nproc_mynode_pp;
+    /* counters for PP+PME and PP-only processes on my physical node */
+    int nrank_intranode, rank_intranode;
+    int nrank_pp_intranode, rank_pp_intranode;
+    /* thread-MPI is not initialized when not running in parallel */
 #if defined GMX_MPI && !defined GMX_THREAD_MPI
+    int nrank_world, rank_world;
     int i, mynum, *num, *num_s, *num_pp, *num_pp_s;
-#endif
 
-    nnodes    = cr->nnodes;
-    nnodes_pp = nnodes - cr->npmenodes;
+    MPI_Comm_size(MPI_COMM_WORLD,&nrank_world);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank_world);
 
-#if defined GMX_MPI && !defined GMX_THREAD_MPI
-    /* We have MPI and can expect to have different compute nodes */
+    /* Get the node number from the hostname to identify the nodes */
     mynum = gmx_hostname_num();
 
     /* We can't rely on MPI_IN_PLACE, so we need send and receive buffers */
-    snew(num,   nnodes);
-    snew(num_s, nnodes);
-    snew(num_pp,   nnodes_pp);
-    snew(num_pp_s, nnodes_pp);
-
-    num_s[cr->sim_nodeid] = mynum;
-    if (cr->duty & DUTY_PP)
-    {
-        num_pp_s[cr->nodeid] = mynum;
-    }
-
-    MPI_Allreduce(num_s, num, nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
-    MPI_Allreduce(num_pp_s, num_pp, nnodes_pp, MPI_INT, MPI_SUM, cr->mpi_comm_mygroup);
-
-    id_mynode       = 0;
-    id_mynode_group = 0;
-    nproc_mynode    = 0;
-    nproc_mynode_pp = 0;
-    for(i=0; i<nnodes; i++)
+    snew(num,   nrank_world);
+    snew(num_s, nrank_world);
+    snew(num_pp,   nrank_world);
+    snew(num_pp_s, nrank_world);
+
+    num_s[rank_world]    = mynum;
+    num_pp_s[rank_world] = (cr->duty & DUTY_PP) ? mynum : -1;
+
+    MPI_Allreduce(num_s,    num,    nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(num_pp_s, num_pp, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    nrank_intranode    = 0;
+    rank_intranode     = 0;
+    nrank_pp_intranode = 0;
+    rank_pp_intranode  = 0;
+    for(i=0; i<nrank_world; i++)
     {
         if (num[i] == mynum)
         {
-            nproc_mynode++;
-            if (i < cr->sim_nodeid)
+            nrank_intranode++;
+            if (i < rank_world)
             {
-                id_mynode++;
-            }
-            if (i < cr->nodeid)
-            {
-                id_mynode_group++;
+                rank_intranode++;
             }
         }
-    }
-    for(i=0; i<nnodes_pp; i++)
-    {
-        if (num_pp[i] == mynum)
+        if ((cr->duty & DUTY_PP) && num_pp[i] == mynum)
         {
-            nproc_mynode_pp++;
+            nrank_pp_intranode++;
+            if (i < rank_world)
+            {
+                rank_pp_intranode++;
+            }
         }
     }
     sfree(num);
@@ -449,11 +444,11 @@ void gmx_init_intra_counters(t_commrec *cr)
     sfree(num_pp);
     sfree(num_pp_s);
 #else
-    /* Serial or thread-MPI code, we are running within a node */
-    id_mynode       = cr->sim_nodeid;
-    id_mynode_group = cr->nodeid;
-    nproc_mynode    = cr->nnodes;
-    nproc_mynode_pp = cr->nnodes - cr->npmenodes;
+    /* Serial or thread-MPI code: we run within a single physical node */
+    nrank_intranode    = cr->nnodes;
+    rank_intranode     = cr->sim_nodeid;
+    nrank_pp_intranode = cr->nnodes - cr->npmenodes;
+    rank_pp_intranode  = cr->nodeid;
 #endif
 
     if (debug)
@@ -467,15 +462,17 @@ void gmx_init_intra_counters(t_commrec *cr)
         {
             sprintf(sbuf, "%s", cr->duty & DUTY_PP ? "PP" : "PME");
         }
-        fprintf(debug, "On %3s node %d: nodeid_intra=%d, nodeid_group_intra=%d, "
-                "nnodes_intra=%d, nnodes_pp_intra=%d\n", sbuf, cr->sim_nodeid,
-                id_mynode, id_mynode_group, nproc_mynode, nproc_mynode_pp);
+        fprintf(debug, "On %3s node %d: nrank_intranode=%d, rank_intranode=%d, "
+                "nrank_pp_intranode=%d, rank_pp_intranode=%d\n",
+                sbuf, cr->sim_nodeid,
+                nrank_intranode, rank_intranode,
+                nrank_pp_intranode, rank_pp_intranode);
     }
 
-    cr->nodeid_intra        = id_mynode;
-    cr->nodeid_group_intra  = id_mynode_group;
-    cr->nnodes_intra        = nproc_mynode;
-    cr->nnodes_pp_intra     = nproc_mynode_pp;
+    cr->nrank_intranode    = nrank_intranode;
+    cr->rank_intranode     = rank_intranode;
+    cr->nrank_pp_intranode = nrank_pp_intranode;
+    cr->rank_pp_intranode  = rank_pp_intranode;
 }
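
The counting scheme above, reduced to a standalone sketch (a crude hostname
hash stands in for gmx_hostname_num(), and all names are illustrative): every
rank contributes its host identifier, the full vector is rebuilt with
MPI_Allreduce, and a rank's intranode index is the number of lower world ranks
reporting the same host.

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        int          rank, size, i, myhost;
        int          nrank_intranode = 0, rank_intranode = 0;
        int         *host, *host_s;
        unsigned int h = 0;
        char         name[256];

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        /* crude hostname hash instead of gmx_hostname_num(); collisions possible */
        gethostname(name, sizeof(name));
        name[sizeof(name)-1] = '\0';
        for (i = 0; name[i] != '\0'; i++)
        {
            h = 31u*h + (unsigned char)name[i];
        }
        myhost = (int)(h % 1000000);

        /* no MPI_IN_PLACE, as in the patch: separate send and receive buffers */
        host   = calloc(size, sizeof(int));
        host_s = calloc(size, sizeof(int));
        host_s[rank] = myhost;
        MPI_Allreduce(host_s, host, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        for (i = 0; i < size; i++)
        {
            if (host[i] == myhost)
            {
                nrank_intranode++;
                if (i < rank)
                {
                    rank_intranode++;
                }
            }
        }
        printf("world rank %d: rank %d of %d on this node\n",
               rank, rank_intranode, nrank_intranode);

        free(host);
        free(host_s);
        MPI_Finalize();
        return 0;
    }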
 
 
src/kernel/runner.c
index 1634cdef86e36a2f7f4e8ab082f0f93cdce3446d..37e9854baa56eefe538113d0fa2179e5c61de49a 100644 (file)
@@ -927,18 +927,9 @@ static void set_cpu_affinity(FILE *fplog,
             /* We need to determine a scan of the thread counts in this
              * compute node.
              */
-            int process_index;
             MPI_Comm comm_intra;
 
-            process_index = cr->nodeid_intra;
-            if (MULTISIM(cr))
-            {
-                /* To simplify the code, we shift process indices by nnodes.
-                 * There might be far less processes, but that doesn't matter.
-                 */
-                process_index += cr->ms->sim*cr->nnodes;
-            }
-            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),process_index,
+            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
                            &comm_intra);
             MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
             /* MPI_Scan is inclusive, but here we need exclusive */
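
The split-plus-scan pattern in isolation, as a standalone sketch with made-up
node sizes and thread counts (illustrative only): ranks sharing a color form an
intra-node communicator ordered by their intranode rank, and an inclusive
MPI_Scan over the local thread counts, minus the own contribution, yields the
index of each rank's first core.

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int      world_rank, color, rank_intranode, nthread_local, thread_offset;
        MPI_Comm comm_intra;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

        color          = world_rank / 2; /* pretend every two ranks share a node */
        rank_intranode = world_rank % 2;
        nthread_local  = 4;              /* e.g. 4 OpenMP threads per rank */

        MPI_Comm_split(MPI_COMM_WORLD, color, rank_intranode, &comm_intra);
        MPI_Scan(&nthread_local, &thread_offset, 1, MPI_INT, MPI_SUM, comm_intra);
        thread_offset -= nthread_local;  /* MPI_Scan is inclusive, we need exclusive */

        printf("world rank %d starts pinning at core %d on its node\n",
               world_rank, thread_offset);

        MPI_Comm_free(&comm_intra);
        MPI_Finalize();
        return 0;
    }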
@@ -1037,8 +1028,8 @@ static void set_cpu_affinity(FILE *fplog,
 
             if (debug)
             {
-                fprintf(debug, "On node %d, thread %d the affinity setting returned %d\n",
-                        cr->nodeid, gmx_omp_get_thread_num(), setaffinity_ret);
+                fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
+                        cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
             }
         }
 
@@ -1677,8 +1668,8 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         gmx_setup_nodecomm(fplog,cr);
     }
 
-    /* Initialize per-node process ID and counters. */
-    gmx_init_intra_counters(cr);
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
 
 #ifdef GMX_MPI
     md_print_info(cr,fplog,"Using %d MPI %s\n",
src/mdlib/forcerec.c
index 6c3665f6d88b73dc8de6f10df2eeae4e215a805c..34d4c963935932a138e3ed45a5cb8573b5682331 100644 (file)
@@ -1506,13 +1506,13 @@ static void pick_nbnxn_kernel(FILE *fp,
         {
             /* Each PP node will use the intra-node id-th device from the
              * list of detected/selected GPUs. */
-            if (!init_gpu(cr->nodeid_group_intra, gpu_err_str, &hwinfo->gpu_info))
+            if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
             {
                 /* At this point the init should never fail as we made sure that
                  * we have all the GPUs we need. If it still does, we'll bail. */
                 gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
                           cr->nodeid,
-                          get_gpu_device_id(&hwinfo->gpu_info, cr->nodeid_group_intra),
+                          get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
                           gpu_err_str);
             }
         }
@@ -1791,7 +1791,7 @@ static void init_nb_verlet(FILE *fp,
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
         nbnxn_cuda_init(fp, &nbv->cu_nbv,
-                        &fr->hwinfo->gpu_info, cr->nodeid_group_intra,
+                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
                         (nbv->ngrp > 1) && !bHybridGPURun);
 
         if ((env = getenv("GMX_NB_MIN_CI")) != NULL)