fixed issue with -multi and GPUs
author     Berk Hess <hess@kth.se>
Thu, 29 Nov 2012 21:44:15 +0000 (22:44 +0100)
committer  Berk Hess <hess@kth.se>
Fri, 30 Nov 2012 09:09:57 +0000 (10:09 +0100)
Several GPU-related parts of the code did not take the multi-simulation
(-multi) functionality and separate PME nodes into account.
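
To illustrate the problem, here is a minimal standalone sketch (hypothetical
rank counts and names, not part of this change) of how a per-simulation
intra-node ID can collide on a shared physical node under -multi, while a
rank counted over the whole physical node stays unique:

    #include <stdio.h>

    /* Illustration only (not GROMACS code): two simulations of a -multi run,
     * two PP ranks each, all four ranks placed on one physical node. */
    int main(void)
    {
        int nsim = 2, nrank_per_sim = 2;
        int world_rank;

        for (world_rank = 0; world_rank < nsim*nrank_per_sim; world_rank++)
        {
            int sim              = world_rank / nrank_per_sim;
            /* old: ID restarts at 0 within each simulation's PP group */
            int old_id_group     = world_rank % nrank_per_sim;
            /* new: rank counted over all PP ranks on the physical node */
            int new_pp_intranode = world_rank;

            printf("world rank %d (sim %d): old GPU index %d, new GPU index %d\n",
                   world_rank, sim, old_id_group, new_pp_intranode);
        }
        return 0;
    }

With the old per-simulation ID, ranks 0 and 2 would both initialize GPU 0;
the per-physical-node PP rank gives every rank its own device.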

Change-Id: I4f6471dfda3d14ed7ca33205c56849a27564c742

include/network.h
include/types/commrec.h
src/gmxlib/gmx_detect_hardware.c
src/gmxlib/gmx_omp_nthreads.c
src/gmxlib/network.c
src/kernel/runner.c
src/mdlib/forcerec.c

include/network.h
index f44a3c700f814bcd55c6748122e85cb3b27bf8d8..35ce9bc5286506d0940d3400ddb8b854dd82e86f 100644 (file)
@@ -76,8 +76,8 @@ void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr);
 /* Sets up fast global communication for clusters with multi-core nodes */
 
 GMX_LIBGMX_EXPORT
-void gmx_init_intra_counters(t_commrec *cr);
-/* Initializes intra-node process counts and ID. */
+void gmx_init_intranode_counters(t_commrec *cr);
+/* Initializes intra-physical-node MPI process/thread counts and ID. */
 
 gmx_bool gmx_mpi_initialized(void);
 /* return TRUE when MPI_Init has been called.
include/types/commrec.h
index 4d8a93efa91e9ed3a36d41ca7d8e5d39b5e9bb18..6a1810d7b1c0f73f729b9e66390d3575ff239460 100644 (file)
@@ -274,11 +274,11 @@ typedef struct {
   MPI_Comm mpi_comm_mysim;
   MPI_Comm mpi_comm_mygroup;
 
-  /* intra-node stuff */
-  int nodeid_intra;         /* ID over all intra nodes */ 
-  int nodeid_group_intra;   /* ID within my group (separate 0-n IDs for PP/PME-only nodes) */
-  int nnodes_intra;         /* total number of intra nodes */
-  int nnodes_pp_intra;      /* total number of PP intra nodes */
+  /* MPI ranks within a physical node for hardware access */
+  int nrank_intranode;    /* nr of ranks on this physical node */
+  int rank_intranode;     /* our rank on this physical node */
+  int nrank_pp_intranode; /* as nrank_intranode, for particle-particle only */
+  int rank_pp_intranode;  /* as rank_intranode, for particle-particle only */
 
   gmx_nodecomm_t nc;
   
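A minimal sketch of how these counters are meant to be consumed, assuming one
GPU per PP rank on a node (the struct and helper below are hypothetical
stand-ins, not GROMACS code):

    #include <stdio.h>

    /* Hypothetical stand-in for the fields above: each PP rank on a physical
     * node picks the GPU whose index equals its PP rank within that node. */
    typedef struct {
        int nrank_pp_intranode; /* nr of PP ranks on this physical node */
        int rank_pp_intranode;  /* our PP rank on this physical node */
    } pp_node_counters_t;

    static int pick_gpu(const pp_node_counters_t *c, int ngpu_on_node)
    {
        /* -1 signals that there are not enough GPUs for all PP ranks */
        return (c->rank_pp_intranode < ngpu_on_node) ? c->rank_pp_intranode : -1;
    }

    int main(void)
    {
        pp_node_counters_t c = { 2, 1 }; /* second of two PP ranks on this node */
        printf("GPU index: %d\n", pick_gpu(&c, 2)); /* prints 1 */
        return 0;
    }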
src/gmxlib/gmx_detect_hardware.c
index c9f5fc97aacfdfc6c0d5fb95ce0b3c8e4fe635fb..21e0c649b92a5c6e8c80c0580e68b7c3aea45c1f 100644 (file)
@@ -239,7 +239,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
      * => keep on the GPU support, otherwise turn off (or bail if forced)
      * */
     /* number of PP processes per node */
-    npppn = cr->nnodes_pp_intra;
+    npppn = cr->nrank_pp_intranode;
 
     pernode[0] = '\0';
     th_or_proc_plural[0] = '\0';
@@ -301,7 +301,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                                   ShortProgram(), npppn, npppn > 1 ? "s" : "",
                                   bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
 
-                    if (cr->nodeid_intra == 0)
+                    if (cr->rank_pp_intranode == 0)
                     {
                         limit_num_gpus_used(hwinfo, npppn);
                         ngpu = hwinfo->gpu_info.ncuda_dev_use;
@@ -333,7 +333,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                                   th_or_proc, th_or_proc_plural, pernode, gpu_plural,
                                   th_or_proc, npppn, gpu_plural, pernode);
 
-                    if (bMPI || (btMPI && cr->nodeid_intra == 0))
+                    if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
                     {
                         limit_num_gpus_used(hwinfo, npppn);
                         ngpu = hwinfo->gpu_info.ncuda_dev_use;
@@ -347,7 +347,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
                      * level, since the hardware setup and MPI process count
                      * might be differ over physical nodes.
                      */
-                    if (cr->nodeid_intra == 0)
+                    if (cr->rank_pp_intranode == 0)
                     {
                         gmx_fatal(FARGS,
                                   "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
@@ -366,7 +366,7 @@ void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
             }
         }
 
-        if (hwinfo->gpu_info.bUserSet && (cr->nodeid_intra == 0))
+        if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
         {
             int i, j, same_count;
             gmx_bool bSomeSame, bAllDifferent;
src/gmxlib/gmx_omp_nthreads.c
index 8915ed9a9031638ede8e896e5c9b1d4cd4850161..4cbdfca374fe80dd876bc673c1a23ab1fd021e71 100644 (file)
@@ -236,8 +236,8 @@ void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
     bOMP = FALSE;
 #endif /* GMX_OPENMP */
 
-    /* number of processes per node */
-    nppn = cr->nnodes_intra;
+    /* number of MPI processes/threads per physical node */
+    nppn = cr->nrank_intranode;
 
     bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
               (!(cr->duty & DUTY_PP) &&  (cr->duty & DUTY_PME));
@@ -407,7 +407,7 @@ void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
 
     /* detect and warn about oversubscription
      * TODO: enable this for separate PME nodes as well! */
-    if (!bSepPME && cr->nodeid_intra == 0)
+    if (!bSepPME && cr->rank_pp_intranode == 0)
     {
         char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN];
 
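A rough sketch of the per-node gating this change enables (assumed, simplified
logic rather than the actual implementation): only the first PP rank on each
physical node emits the oversubscription warning, comparing ranks-per-node
times threads-per-rank against the node's hardware thread count.

    #include <stdio.h>

    /* Simplified, assumed logic (not the GROMACS implementation): warn once
     * per physical node when the launch configuration oversubscribes it. */
    static void warn_if_oversubscribed(int rank_pp_intranode, int nrank_intranode,
                                       int nth_per_rank, int nhwthread_on_node)
    {
        if (rank_pp_intranode == 0 &&
            nrank_intranode*nth_per_rank > nhwthread_on_node)
        {
            fprintf(stderr,
                    "WARNING: %d ranks x %d threads oversubscribe the %d hardware threads on this node\n",
                    nrank_intranode, nth_per_rank, nhwthread_on_node);
        }
    }

    int main(void)
    {
        warn_if_oversubscribed(0, 4, 8, 16); /* 32 threads on 16 hw threads -> warns */
        return 0;
    }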
src/gmxlib/network.c
index bf33760ebf91d009b2d8052fdf099312fb8d5a18..71ceb9862264dd888555a2e62b6cd6c7b885f0b1 100644 (file)
@@ -388,60 +388,55 @@ void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr)
 #endif
 }
 
-void gmx_init_intra_counters(t_commrec *cr)
+void gmx_init_intranode_counters(t_commrec *cr)
 {
-    /* counters for PP+PME and PP-only processes on my node */
-    int nnodes, nnodes_pp, id_mynode=-1, id_mynode_group=-1, nproc_mynode, nproc_mynode_pp;
+    /* counters for PP+PME and PP-only processes on my physical node */
+    int nrank_intranode, rank_intranode;
+    int nrank_pp_intranode, rank_pp_intranode;
+    /* thread-MPI is not initialized when not running in parallel */
 #if defined GMX_MPI && !defined GMX_THREAD_MPI
+    int nrank_world, rank_world;
     int i, mynum, *num, *num_s, *num_pp, *num_pp_s;
-#endif
 
-    nnodes    = cr->nnodes;
-    nnodes_pp = nnodes - cr->npmenodes;
+    MPI_Comm_size(MPI_COMM_WORLD,&nrank_world);
+    MPI_Comm_rank(MPI_COMM_WORLD,&rank_world);
 
-#if defined GMX_MPI && !defined GMX_THREAD_MPI
-    /* We have MPI and can expect to have different compute nodes */
+    /* Get the node number from the hostname to identify the nodes */
     mynum = gmx_hostname_num();
 
     /* We can't rely on MPI_IN_PLACE, so we need send and receive buffers */
-    snew(num,   nnodes);
-    snew(num_s, nnodes);
-    snew(num_pp,   nnodes_pp);
-    snew(num_pp_s, nnodes_pp);
-
-    num_s[cr->sim_nodeid] = mynum;
-    if (cr->duty & DUTY_PP)
-    {
-        num_pp_s[cr->nodeid] = mynum;
-    }
-
-    MPI_Allreduce(num_s, num, nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
-    MPI_Allreduce(num_pp_s, num_pp, nnodes_pp, MPI_INT, MPI_SUM, cr->mpi_comm_mygroup);
-
-    id_mynode       = 0;
-    id_mynode_group = 0;
-    nproc_mynode    = 0;
-    nproc_mynode_pp = 0;
-    for(i=0; i<nnodes; i++)
+    snew(num,   nrank_world);
+    snew(num_s, nrank_world);
+    snew(num_pp,   nrank_world);
+    snew(num_pp_s, nrank_world);
+
+    num_s[rank_world]    = mynum;
+    num_pp_s[rank_world] = (cr->duty & DUTY_PP) ? mynum : -1;
+
+    MPI_Allreduce(num_s,    num,    nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    MPI_Allreduce(num_pp_s, num_pp, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    nrank_intranode    = 0;
+    rank_intranode     = 0;
+    nrank_pp_intranode = 0;
+    rank_pp_intranode  = 0;
+    for(i=0; i<nrank_world; i++)
     {
         if (num[i] == mynum)
         {
-            nproc_mynode++;
-            if (i < cr->sim_nodeid)
+            nrank_intranode++;
+            if (i < rank_world)
             {
-                id_mynode++;
-            }
-            if (i < cr->nodeid)
-            {
-                id_mynode_group++;
+                rank_intranode++;
             }
         }
-    }
-    for(i=0; i<nnodes_pp; i++)
-    {
-        if (num_pp[i] == mynum)
+        if ((cr->duty & DUTY_PP) && num_pp[i] == mynum)
         {
-            nproc_mynode_pp++;
+            nrank_pp_intranode++;
+            if (i < rank_world)
+            {
+                rank_pp_intranode++;
+            }
         }
     }
     sfree(num);
@@ -449,11 +444,11 @@ void gmx_init_intra_counters(t_commrec *cr)
     sfree(num_pp);
     sfree(num_pp_s);
 #else
-    /* Serial or thread-MPI code, we are running within a node */
-    id_mynode       = cr->sim_nodeid;
-    id_mynode_group = cr->nodeid;
-    nproc_mynode    = cr->nnodes;
-    nproc_mynode_pp = cr->nnodes - cr->npmenodes;
+    /* Serial or thread-MPI code: we run within a single physical node */
+    nrank_intranode    = cr->nnodes;
+    rank_intranode     = cr->sim_nodeid;
+    nrank_pp_intranode = cr->nnodes - cr->npmenodes;
+    rank_pp_intranode  = cr->nodeid;
 #endif
 
     if (debug)
@@ -467,15 +462,17 @@ void gmx_init_intra_counters(t_commrec *cr)
         {
             sprintf(sbuf, "%s", cr->duty & DUTY_PP ? "PP" : "PME");
         }
-        fprintf(debug, "On %3s node %d: nodeid_intra=%d, nodeid_group_intra=%d, "
-                "nnodes_intra=%d, nnodes_pp_intra=%d\n", sbuf, cr->sim_nodeid,
-                id_mynode, id_mynode_group, nproc_mynode, nproc_mynode_pp);
+        fprintf(debug, "On %3s node %d: nrank_intranode=%d, rank_intranode=%d, "
+                "nrank_pp_intranode=%d, rank_pp_intranode=%d\n",
+                sbuf, cr->sim_nodeid,
+                nrank_intranode, rank_intranode,
+                nrank_pp_intranode, rank_pp_intranode);
     }
 
-    cr->nodeid_intra        = id_mynode;
-    cr->nodeid_group_intra  = id_mynode_group;
-    cr->nnodes_intra        = nproc_mynode;
-    cr->nnodes_pp_intra     = nproc_mynode_pp;
+    cr->nrank_intranode    = nrank_intranode;
+    cr->rank_intranode     = rank_intranode;
+    cr->nrank_pp_intranode = nrank_pp_intranode;
+    cr->rank_pp_intranode  = rank_pp_intranode;
 }
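
The counting scheme above, reduced to a standalone sketch (a crude hostname
hash stands in for gmx_hostname_num(), and all names are illustrative): every
rank contributes its host identifier, the full vector is rebuilt with
MPI_Allreduce, and a rank's intranode index is the number of lower world ranks
reporting the same host.

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        int          rank, size, i, myhost;
        int          nrank_intranode = 0, rank_intranode = 0;
        int         *host, *host_s;
        unsigned int h = 0;
        char         name[256];

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        /* crude hostname hash instead of gmx_hostname_num(); collisions possible */
        gethostname(name, sizeof(name));
        name[sizeof(name)-1] = '\0';
        for (i = 0; name[i] != '\0'; i++)
        {
            h = 31u*h + (unsigned char)name[i];
        }
        myhost = (int)(h % 1000000);

        /* no MPI_IN_PLACE, as in the patch: separate send and receive buffers */
        host   = calloc(size, sizeof(int));
        host_s = calloc(size, sizeof(int));
        host_s[rank] = myhost;
        MPI_Allreduce(host_s, host, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        for (i = 0; i < size; i++)
        {
            if (host[i] == myhost)
            {
                nrank_intranode++;
                if (i < rank)
                {
                    rank_intranode++;
                }
            }
        }
        printf("world rank %d: rank %d of %d on this node\n",
               rank, rank_intranode, nrank_intranode);

        free(host);
        free(host_s);
        MPI_Finalize();
        return 0;
    }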
 
 
src/kernel/runner.c
index 1634cdef86e36a2f7f4e8ab082f0f93cdce3446d..37e9854baa56eefe538113d0fa2179e5c61de49a 100644 (file)
@@ -927,18 +927,9 @@ static void set_cpu_affinity(FILE *fplog,
             /* We need to determine a scan of the thread counts in this
              * compute node.
              */
-            int process_index;
             MPI_Comm comm_intra;
 
-            process_index = cr->nodeid_intra;
-            if (MULTISIM(cr))
-            {
-                /* To simplify the code, we shift process indices by nnodes.
-                 * There might be far less processes, but that doesn't matter.
-                 */
-                process_index += cr->ms->sim*cr->nnodes;
-            }
-            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),process_index,
+            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
                            &comm_intra);
             MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
             /* MPI_Scan is inclusive, but here we need exclusive */
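
The split-plus-scan pattern in isolation, as a standalone sketch with made-up
node sizes and thread counts (illustrative only): ranks sharing a color form an
intra-node communicator ordered by their intranode rank, and an inclusive
MPI_Scan over the local thread counts, minus the own contribution, yields the
index of each rank's first core.

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int      world_rank, color, rank_intranode, nthread_local, thread_offset;
        MPI_Comm comm_intra;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

        color          = world_rank / 2; /* pretend every two ranks share a node */
        rank_intranode = world_rank % 2;
        nthread_local  = 4;              /* e.g. 4 OpenMP threads per rank */

        MPI_Comm_split(MPI_COMM_WORLD, color, rank_intranode, &comm_intra);
        MPI_Scan(&nthread_local, &thread_offset, 1, MPI_INT, MPI_SUM, comm_intra);
        thread_offset -= nthread_local;  /* MPI_Scan is inclusive, we need exclusive */

        printf("world rank %d starts pinning at core %d on its node\n",
               world_rank, thread_offset);

        MPI_Comm_free(&comm_intra);
        MPI_Finalize();
        return 0;
    }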
@@ -1037,8 +1028,8 @@ static void set_cpu_affinity(FILE *fplog,
 
             if (debug)
             {
-                fprintf(debug, "On node %d, thread %d the affinity setting returned %d\n",
-                        cr->nodeid, gmx_omp_get_thread_num(), setaffinity_ret);
+                fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
+                        cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
             }
         }
 
@@ -1677,8 +1668,8 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         gmx_setup_nodecomm(fplog,cr);
     }
 
-    /* Initialize per-node process ID and counters. */
-    gmx_init_intra_counters(cr);
+    /* Initialize per-physical-node MPI process/thread ID and counters. */
+    gmx_init_intranode_counters(cr);
 
 #ifdef GMX_MPI
     md_print_info(cr,fplog,"Using %d MPI %s\n",
src/mdlib/forcerec.c
index 6c3665f6d88b73dc8de6f10df2eeae4e215a805c..34d4c963935932a138e3ed45a5cb8573b5682331 100644 (file)
@@ -1506,13 +1506,13 @@ static void pick_nbnxn_kernel(FILE *fp,
         {
             /* Each PP node will use the intra-node id-th device from the
              * list of detected/selected GPUs. */
-            if (!init_gpu(cr->nodeid_group_intra, gpu_err_str, &hwinfo->gpu_info))
+            if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
             {
                 /* At this point the init should never fail as we made sure that
                  * we have all the GPUs we need. If it still does, we'll bail. */
                 gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
                           cr->nodeid,
-                          get_gpu_device_id(&hwinfo->gpu_info, cr->nodeid_group_intra),
+                          get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
                           gpu_err_str);
             }
         }
@@ -1791,7 +1791,7 @@ static void init_nb_verlet(FILE *fp,
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
         nbnxn_cuda_init(fp, &nbv->cu_nbv,
-                        &fr->hwinfo->gpu_info, cr->nodeid_group_intra,
+                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
                         (nbv->ngrp > 1) && !bHybridGPURun);
 
         if ((env = getenv("GMX_NB_MIN_CI")) != NULL)