From: Berk Hess <hess@kth.se>
Date: Wed, 29 Apr 2015 10:13:00 +0000 (+0200)
Subject: Hardware reporting now covers all nodes
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=12867f75c8a3ef04f72b028d8be81a0864535820;p=alexxy%2Fgromacs.git

Hardware reporting now covers all nodes

The CPU and GPU hardware reporting and checking were only printed
for the node(s) of the master rank(s) of each simulation.
Now the most important CPU and GPU information is reduced over
MPI_COMM_WORLD. These results are printed and checks/warnings are
now based on this.
Because this is now system wide, the printing is moved up to directly
after the detection and GPU info is printed together with CPU info.
The Gromacs reference printing is moved to after hardware printing.

Refs #1643.

Change-Id: I974871c9a293a922dd1db9dcf633d485f62996ae
---

diff --git a/src/gromacs/gmxlib/gmx_cpuid.c b/src/gromacs/gmxlib/gmx_cpuid.c
index 9791608f4b..8e496ea5e6 100644
--- a/src/gromacs/gmxlib/gmx_cpuid.c
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@ -289,6 +289,13 @@ static const enum gmx_cpuid_simd compiled_simd = GMX_CPUID_SIMD_NONE;
 #endif
 
 
+enum gmx_cpuid_simd
+gmx_compiled_simd()
+{
+    return compiled_simd;
+}
+
+
 #ifdef GMX_CPUID_X86
 
 /* Execute CPUID on x86 class CPUs. level sets function to exec, and the
@@ -1174,19 +1181,19 @@ gmx_cpuid_formatstring       (gmx_cpuid_t              cpuid,
 
 #ifdef _MSC_VER
     _snprintf(str, n,
-              "Vendor: %s\n"
-              "Brand:  %s\n"
-              "Family: %2d  Model: %2d  Stepping: %2d\n"
-              "Features:",
+              "    Vendor: %s\n"
+              "    Brand:  %s\n"
+              "    Family: %2d  model: %2d  stepping: %2d\n"
+              "    CPU features:",
               gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)],
               gmx_cpuid_brand(cpuid),
               gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
 #else
     snprintf(str, n,
-             "Vendor: %s\n"
-             "Brand:  %s\n"
-             "Family: %2d  Model: %2d  Stepping: %2d\n"
-             "Features:",
+             "    Vendor: %s\n"
+             "    Brand:  %s\n"
+             "    Family: %2d  model: %2d  stepping: %2d\n"
+             "    CPU features:",
              gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)],
              gmx_cpuid_brand(cpuid),
              gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
@@ -1307,34 +1314,14 @@ gmx_cpuid_simd_suggest  (gmx_cpuid_t                 cpuid)
 }
 
 
-
 int
-gmx_cpuid_simd_check(gmx_cpuid_t   cpuid,
-                     FILE *        log,
-                     int           print_to_stderr)
+gmx_cpuid_simd_check(enum gmx_cpuid_simd  simd_suggest,
+                     FILE *               log,
+                     int                  print_to_stderr)
 {
-    int                           rc;
-    char                          str[1024];
-    enum gmx_cpuid_simd           simd;
-
-    simd = gmx_cpuid_simd_suggest(cpuid);
-
-    rc = (simd != compiled_simd);
+    int  rc;
 
-    gmx_cpuid_formatstring(cpuid, str, 1023);
-    str[1023] = '\0';
-
-    if (log != NULL)
-    {
-        fprintf(log,
-                "\nDetecting CPU SIMD instructions.\nPresent hardware specification:\n"
-                "%s"
-                "SIMD instructions most likely to fit this hardware: %s\n"
-                "SIMD instructions selected at GROMACS compile time: %s\n\n",
-                str,
-                gmx_cpuid_simd_string[simd],
-                gmx_cpuid_simd_string[compiled_simd]);
-    }
+    rc = (simd_suggest != compiled_simd);
 
     if (rc != 0)
     {
@@ -1343,14 +1330,14 @@ gmx_cpuid_simd_check(gmx_cpuid_t   cpuid,
             fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n"
                     "SIMD instructions most likely to fit this hardware: %s\n"
                     "SIMD instructions selected at GROMACS compile time: %s\n\n",
-                    gmx_cpuid_simd_string[simd],
+                    gmx_cpuid_simd_string[simd_suggest],
                     gmx_cpuid_simd_string[compiled_simd]);
         }
         if (print_to_stderr)
         {
             fprintf(stderr, "Compiled SIMD instructions: %s (GROMACS could use %s on this machine, which is better)\n",
                     gmx_cpuid_simd_string[compiled_simd],
-                    gmx_cpuid_simd_string[simd]);
+                    gmx_cpuid_simd_string[simd_suggest]);
         }
     }
     return rc;
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.cpp b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
index faa5678e8d..a851dbee5d 100644
--- a/src/gromacs/gmxlib/gmx_detect_hardware.cpp
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
@@ -109,7 +109,7 @@ static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
     for (i = 0; i < ndev; i++)
     {
         get_gpu_device_info_string(stmp, gpu_info, i);
-        strcat(sbuf, "  ");
+        strcat(sbuf, "    ");
         strcat(sbuf, stmp);
         if (i < ndev - 1)
         {
@@ -294,17 +294,6 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
     bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
     bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
 
-    /* check the SIMD level mdrun is compiled with against hardware
-       capabilities */
-    /* TODO: Here we assume homogeneous hardware which is not necessarily
-             the case! Might not hurt to add an extra check over MPI. */
-    gmx_cpuid_simd_check(hwinfo->cpuid_info, fplog, SIMMASTER(cr));
-
-    check_use_of_rdtscp_on_this_cpu(fplog, cr, hwinfo);
-
-    /* NOTE: this print is only for and on one physical node */
-    print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
-
     if (hwinfo->gpu_info.n_dev_compatible > 0)
     {
         std::string gpuUseageReport;
@@ -317,7 +306,7 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 
         /* NOTE: this print is only for and on one physical node */
-        md_print_info(cr, fplog, gpuUseageReport.c_str());
+        md_print_info(cr, fplog, "%s\n", gpuUseageReport.c_str());
     }
 
     /* Need to ensure that we have enough GPUs:
@@ -532,6 +521,28 @@ static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
     return uniq_count;
 }
 
+static int get_ncores(gmx_cpuid_t cpuid)
+{
+    int        nprocessors, npackages, ncores_per_package, nhwthreads_per_core;
+    const int *package_id, *core_id, *hwthread_id, *locality_order;
+    int        rc;
+
+    rc = gmx_cpuid_topology(cpuid,
+                            &nprocessors, &npackages,
+                            &ncores_per_package, &nhwthreads_per_core,
+                            &package_id, &core_id,
+                            &hwthread_id, &locality_order);
+
+    if (rc == 0)
+    {
+        return npackages*ncores_per_package;
+    }
+    else
+    {
+        /* We don't have cpuid topology info, return 0 core count */
+        return 0;
+    }
+}
 
 /* Return the number of hardware threads supported by the current CPU.
  * We assume that this is equal with the number of "processors"
@@ -667,10 +678,142 @@ static void gmx_detect_gpus(FILE *fplog, const t_commrec *cr)
 #endif
 }
 
+static void gmx_collect_hardware_mpi()
+{
+#ifdef GMX_LIB_MPI
+    int  rank_id;
+    int  nrank, rank, ncore, nhwthread, ngpu, i;
+    int  gpu_hash;
+    int *buf, *all;
+
+    rank_id   = gmx_physicalnode_id_hash();
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nrank);
+    ncore     = hwinfo_g->ncore;
+    nhwthread = hwinfo_g->nthreads_hw_avail;
+    ngpu      = hwinfo_g->gpu_info.n_dev_compatible;
+    /* Create a unique hash of the GPU type(s) in this node */
+    gpu_hash  = 0;
+    /* Here it might be better to only loop over the compatible GPUs, but we
+     * don't have that information available and it would also require
+     * removing the device ID from the device info string.
+     */
+    for (i = 0; i < hwinfo_g->gpu_info.n_dev; i++)
+    {
+        char stmp[STRLEN];
+
+        /* Since the device ID is incorporated in the hash, the order of
+         * the GPUs affects the hash. Also two identical GPUs won't give
+         * a gpu_hash of zero after XORing.
+         */
+        get_gpu_device_info_string(stmp, &hwinfo_g->gpu_info, i);
+        gpu_hash ^= gmx_string_fullhash_func(stmp, gmx_string_hash_init);
+    }
+
+    snew(buf, nrank);
+    snew(all, nrank);
+    buf[rank] = rank_id;
+
+    MPI_Allreduce(buf, all, nrank, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    gmx_bool bFound;
+    int      nnode0, ncore0, nhwthread0, ngpu0, r;
+
+    bFound     = FALSE;
+    ncore0     = 0;
+    nnode0     = 0;
+    nhwthread0 = 0;
+    ngpu0      = 0;
+    for (r = 0; r < nrank; r++)
+    {
+        if (all[r] == rank_id)
+        {
+            if (!bFound && r == rank)
+            {
+                /* We are the first rank in this physical node */
+                nnode0     = 1;
+                ncore0     = ncore;
+                nhwthread0 = nhwthread;
+                ngpu0      = ngpu;
+            }
+            bFound = TRUE;
+        }
+    }
+
+    sfree(buf);
+    sfree(all);
+
+    int sum[4], maxmin[10];
+
+    {
+        int buf[4];
+
+        /* Sum values from only intra-rank 0 so we get the sum over all nodes */
+        buf[0] = nnode0;
+        buf[1] = ncore0;
+        buf[2] = nhwthread0;
+        buf[3] = ngpu0;
+
+        MPI_Allreduce(buf, sum, 4, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    }
+
+    {
+        int buf[10];
+
+        /* Store + and - values for all ranks,
+         * so we can get max+min with one MPI call.
+         */
+        buf[0] = ncore;
+        buf[1] = nhwthread;
+        buf[2] = ngpu;
+        buf[3] = gmx_cpuid_simd_suggest(hwinfo_g->cpuid_info);
+        buf[4] = gpu_hash;
+        buf[5] = -buf[0];
+        buf[6] = -buf[1];
+        buf[7] = -buf[2];
+        buf[8] = -buf[3];
+        buf[9] = -buf[4];
+
+        MPI_Allreduce(buf, maxmin, 10, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+    }
+
+    hwinfo_g->nphysicalnode       = sum[0];
+    hwinfo_g->ncore_tot           = sum[1];
+    hwinfo_g->ncore_min           = -maxmin[5];
+    hwinfo_g->ncore_max           = maxmin[0];
+    hwinfo_g->nhwthread_tot       = sum[2];
+    hwinfo_g->nhwthread_min       = -maxmin[6];
+    hwinfo_g->nhwthread_max       = maxmin[1];
+    hwinfo_g->ngpu_compatible_tot = sum[3];
+    hwinfo_g->ngpu_compatible_min = -maxmin[7];
+    hwinfo_g->ngpu_compatible_max = maxmin[2];
+    hwinfo_g->simd_suggest_min    = static_cast<enum gmx_cpuid_simd>(-maxmin[8]);
+    hwinfo_g->simd_suggest_max    = static_cast<enum gmx_cpuid_simd>(maxmin[3]);
+    hwinfo_g->bIdenticalGPUs      = (maxmin[4] == -maxmin[9]);
+#else
+    /* All ranks use the same pointer, protect it with a mutex */
+    tMPI_Thread_mutex_lock(&hw_info_lock);
+    hwinfo_g->nphysicalnode       = 1;
+    hwinfo_g->ncore_tot           = hwinfo_g->ncore;
+    hwinfo_g->ncore_min           = hwinfo_g->ncore;
+    hwinfo_g->ncore_max           = hwinfo_g->ncore;
+    hwinfo_g->nhwthread_tot       = hwinfo_g->nthreads_hw_avail;
+    hwinfo_g->nhwthread_min       = hwinfo_g->nthreads_hw_avail;
+    hwinfo_g->nhwthread_max       = hwinfo_g->nthreads_hw_avail;
+    hwinfo_g->ngpu_compatible_tot = hwinfo_g->gpu_info.n_dev_compatible;
+    hwinfo_g->ngpu_compatible_min = hwinfo_g->gpu_info.n_dev_compatible;
+    hwinfo_g->ngpu_compatible_max = hwinfo_g->gpu_info.n_dev_compatible;
+    hwinfo_g->simd_suggest_min    = gmx_cpuid_simd_suggest(hwinfo_g->cpuid_info);
+    hwinfo_g->simd_suggest_max    = gmx_cpuid_simd_suggest(hwinfo_g->cpuid_info);
+    hwinfo_g->bIdenticalGPUs      = TRUE;
+    tMPI_Thread_mutex_unlock(&hw_info_lock);
+#endif
+}
+
 gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
                                    gmx_bool bDetectGPUs)
 {
-    int              ret;
+    int ret;
 
     /* make sure no one else is doing the same thing */
     ret = tMPI_Thread_mutex_lock(&hw_info_lock);
@@ -691,6 +834,9 @@ gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
             gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
         }
 
+        /* get the number of cores, will be 0 when not detected */
+        hwinfo_g->ncore             = get_ncores(hwinfo_g->cpuid_info);
+
         /* detect number of hardware threads */
         hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
 
@@ -719,9 +865,168 @@ gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
         gmx_fatal(FARGS, "Error unlocking hwinfo mutex: %s", strerror(errno));
     }
 
+    gmx_collect_hardware_mpi();
+
     return hwinfo_g;
 }
 
+static std::string detected_hardware_string(const gmx_hw_info_t *hwinfo,
+                                            bool                 bFullCpuInfo)
+{
+    std::string s;
+
+    s  = gmx::formatString("\n");
+    s += gmx::formatString("Running on %d node%s with total",
+                           hwinfo->nphysicalnode,
+                           hwinfo->nphysicalnode == 1 ? "" : "s");
+    if (hwinfo->ncore_tot > 0)
+    {
+        s += gmx::formatString(" %d cores,", hwinfo->ncore_tot);
+    }
+    s += gmx::formatString(" %d hardware threads", hwinfo->nhwthread_tot);
+    if (hwinfo->gpu_info.bDetectGPUs)
+    {
+        s += gmx::formatString(", %d compatible GPU%s",
+                               hwinfo->ngpu_compatible_tot,
+                               hwinfo->ngpu_compatible_tot == 1 ? "" : "s");
+    }
+    else if (bGPUBinary)
+    {
+        s += gmx::formatString(" (GPU detection deactivated)");
+    }
+    s += gmx::formatString("\n");
+
+    if (hwinfo->nphysicalnode > 1)
+    {
+        /* Print per node hardware feature counts */
+        if (hwinfo->ncore_max > 0)
+        {
+            s += gmx::formatString("Cores per node:            %2d", hwinfo->ncore_min);
+            if (hwinfo->ncore_max > hwinfo->ncore_min)
+            {
+                s += gmx::formatString(" - %2d", hwinfo->ncore_max);
+            }
+            s += gmx::formatString("\n");
+        }
+        s += gmx::formatString("Hardware threads per node: %2d", hwinfo->nhwthread_min);
+        if (hwinfo->nhwthread_max > hwinfo->nhwthread_min)
+        {
+            s += gmx::formatString(" - %2d", hwinfo->nhwthread_max);
+        }
+        s += gmx::formatString("\n");
+        if (bGPUBinary)
+        {
+            s += gmx::formatString("Compatible GPUs per node:  %2d",
+                                   hwinfo->ngpu_compatible_min);
+            if (hwinfo->ngpu_compatible_max > hwinfo->ngpu_compatible_min)
+            {
+                s += gmx::formatString(" - %2d", hwinfo->ngpu_compatible_max);
+            }
+            s += gmx::formatString("\n");
+            if (hwinfo->ngpu_compatible_tot > 0)
+            {
+                if (hwinfo->bIdenticalGPUs)
+                {
+                    s += gmx::formatString("All nodes have identical type(s) of GPUs\n");
+                }
+                else
+                {
+                    /* This message will also appear with identical GPU types
+                     * when at least one node has no GPU.
+                     */
+                    s += gmx::formatString("Different nodes have different type(s) and/or order of GPUs\n");
+                }
+            }
+        }
+    }
+
+#ifdef GMX_LIB_MPI
+    char host[255];
+    int  rank;
+
+    gmx_gethostname(host, 255);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    s += gmx::formatString("Hardware detected on host %s (the node of MPI rank %d):\n",
+                           host, rank);
+#else
+    s += gmx::formatString("Hardware detected:\n");
+#endif
+    s += gmx::formatString("  CPU info:\n");
+    if (bFullCpuInfo)
+    {
+        char buf[1024];
+
+        gmx_cpuid_formatstring(hwinfo->cpuid_info, buf, 1023);
+        buf[1023] = '\0';
+
+        s += gmx::formatString("%s", buf);
+    }
+    else
+    {
+        s += gmx::formatString("    Vendor: %s\n",
+                               gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
+        s += gmx::formatString("    Brand:  %s\n",
+                               gmx_cpuid_brand(hwinfo->cpuid_info));
+    }
+    s += gmx::formatString("    SIMD instructions most likely to fit this hardware: %s",
+                           gmx_cpuid_simd_string[hwinfo->simd_suggest_min]);
+    if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min)
+    {
+        s += gmx::formatString(" - %s",
+                               gmx_cpuid_simd_string[hwinfo->simd_suggest_max]);
+    }
+    s += gmx::formatString("\n");
+    s += gmx::formatString("    SIMD instructions selected at GROMACS compile time: %s\n",
+                           gmx_cpuid_simd_string[gmx_compiled_simd()]);
+    if (bGPUBinary && (hwinfo->ngpu_compatible_tot > 0 ||
+                       hwinfo->gpu_info.n_dev > 0))
+    {
+        s += gmx::formatString("  GPU info:\n");
+        s += gmx::formatString("    Number of GPUs detected: %d\n",
+                               hwinfo->gpu_info.n_dev);
+        if (hwinfo->gpu_info.n_dev > 0)
+        {
+            char buf[STRLEN];
+
+            sprint_gpus(buf, &hwinfo->gpu_info);
+            s += gmx::formatString("%s\n", buf);
+        }
+    }
+
+    return s;
+}
+
+void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
+                                 const gmx_hw_info_t *hwinfo)
+{
+    if (fplog != NULL)
+    {
+        std::string detected;
+
+        detected = detected_hardware_string(hwinfo, TRUE);
+
+        fprintf(fplog, "%s\n", detected.c_str());
+    }
+
+    if (MULTIMASTER(cr))
+    {
+        std::string detected;
+
+        detected = detected_hardware_string(hwinfo, FALSE);
+
+        fprintf(stderr, "%s\n", detected.c_str());
+    }
+
+    /* Check the compiled SIMD instruction set against that of the node
+     * with the lowest SIMD level support.
+     */
+    gmx_cpuid_simd_check(hwinfo->simd_suggest_min, fplog, MULTIMASTER(cr));
+
+    /* For RDTSCP we only check on our local node and skip the MPI reduction */
+    check_use_of_rdtscp_on_this_cpu(fplog, cr, hwinfo);
+}
+
 void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
 {
     char *env;
diff --git a/src/gromacs/gmxlib/gmx_omp_nthreads.c b/src/gromacs/gmxlib/gmx_omp_nthreads.c
index dc1688d36e..9b97291fc4 100644
--- a/src/gromacs/gmxlib/gmx_omp_nthreads.c
+++ b/src/gromacs/gmxlib/gmx_omp_nthreads.c
@@ -426,6 +426,7 @@ reportOpenmpSettings(FILE            *fplog,
                       modth.gnth_pme, modth.gnth_pme > 1 ? "s" : "",
                       cr->nnodes > 1 ? mpi_str : "");
     }
+    md_print_info(cr, fplog, "\n");
 }
 
 /*! \brief Detect and warn about oversubscription of cores.
diff --git a/src/gromacs/gmxlib/main.cpp b/src/gromacs/gmxlib/main.cpp
index 5286b27d0d..28644ba909 100644
--- a/src/gromacs/gmxlib/main.cpp
+++ b/src/gromacs/gmxlib/main.cpp
@@ -255,7 +255,7 @@ void gmx_log_open(const char *lognm, const t_commrec *cr,
         gmx::printBinaryInformation(fp, gmx::getProgramContext(), settings);
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
-    fprintf(fp, "\n\n");
+    fprintf(fp, "\n");
 
     fflush(fp);
     debug_gmx();
diff --git a/src/gromacs/legacyheaders/gmx_cpuid.h b/src/gromacs/legacyheaders/gmx_cpuid.h
index 517bf2e7a2..a997ff6fc5 100644
--- a/src/gromacs/legacyheaders/gmx_cpuid.h
+++ b/src/gromacs/legacyheaders/gmx_cpuid.h
@@ -127,8 +127,12 @@ enum gmx_cpuid_feature
 
 /* Currently supported SIMD instruction sets, intrinsics or other similar combinations
  * in Gromacs. There is not always a 1-to-1 correspondence with feature flags; on some AMD
- * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed),
- * and we still haven't written the AVX2 kernels.
+ * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed).
+ * These are listed in increasing order for sets supported by one CPU.
+ * The order is only used for printing "minimum" and "maximum" suggested
+ * SIMD instruction sets for nodes in a cluster, so pairs like
+ * GMX_CPUID_SIMD_X86_AVX_128_FMA vs GMX_CPUID_SIMD_X86_AVX_256 which strictly
+ * speaking can't be ordered are not really an issue.
  */
 enum gmx_cpuid_simd
 {
@@ -169,6 +173,11 @@ typedef struct gmx_cpuid *
     gmx_cpuid_t;
 
 
+/* Return the SIMD instruction set GROMACS was compiled with. */
+enum gmx_cpuid_simd
+gmx_compiled_simd           ();
+
+
 /* Fill the data structure by using CPU detection instructions.
  * Return 0 on success, 1 if something bad happened.
  */
@@ -316,9 +325,12 @@ gmx_cpuid_simd_suggest  (gmx_cpuid_t                    cpuid);
  * would suggest for the current hardware. Always print stats to the log file
  * if it is non-NULL, and if we don't have a match, print a warning in log
  * (if non-NULL) and if print_to_stderr!=0 also to stderr.
+ * The suggested SIMD instruction set simd_suggest is obtained with
+ * gmx_cpuid_simd_suggest(), but with MPI this might be different for
+ * different nodes, so it should be passed here after parallel reduction.
  */
 int
-gmx_cpuid_simd_check    (gmx_cpuid_t                cpuid,
+gmx_cpuid_simd_check    (enum gmx_cpuid_simd        simd_suggest,
                          FILE *                     log,
                          int                        print_to_stderr);
 
diff --git a/src/gromacs/legacyheaders/gmx_detect_hardware.h b/src/gromacs/legacyheaders/gmx_detect_hardware.h
index 4f74e5e061..56b0384fe6 100644
--- a/src/gromacs/legacyheaders/gmx_detect_hardware.h
+++ b/src/gromacs/legacyheaders/gmx_detect_hardware.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -53,6 +53,12 @@ extern "C" {
 gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
                                    gmx_bool bDetectGPUs);
 
+/* Print information about the detected hardware to fplog (if != NULL)
+ * and to stderr on the master rank.
+ */
+void gmx_print_detected_hardware(FILE *fplog, const t_commrec *cr,
+                                 const gmx_hw_info_t *hwinfo);
+
 void gmx_hardware_info_free(gmx_hw_info_t *hwinfo);
 
 void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt);
diff --git a/src/gromacs/legacyheaders/types/hw_info.h b/src/gromacs/legacyheaders/types/hw_info.h
index 842dfbe64b..51c3b9812f 100644
--- a/src/gromacs/legacyheaders/types/hw_info.h
+++ b/src/gromacs/legacyheaders/types/hw_info.h
@@ -80,14 +80,36 @@ struct gmx_gpu_info_t
  *       (i.e. must be able to be shared among all threads) */
 typedef struct
 {
-    struct gmx_gpu_info_t gpu_info;      /* Information about GPUs detected in the system */
-
-    gmx_cpuid_t           cpuid_info;    /* CPUID information about CPU detected;
-                                            NOTE: this will only detect the CPU thread 0 of the
-                                            current process runs on. */
-    int             nthreads_hw_avail;   /* Number of hardware threads available; this number
-                                            is based on the number of CPUs reported as available
-                                            by the OS at the time of detection. */
+    /* Data for our local physical node */
+    struct gmx_gpu_info_t gpu_info;          /* Information about GPUs detected in the system */
+
+    gmx_cpuid_t           cpuid_info;        /* CPUID information about CPU detected;
+                                                NOTE: this will only detect the CPU thread 0 of the
+                                                current process runs on. */
+    int                   ncore;             /* Number of cores, will be 0 when not detected */
+    int                   nthreads_hw_avail; /* Number of hardware threads available; this number
+                                                is based on the number of CPUs reported as available
+                                                by the OS at the time of detection. */
+
+    /* Data reduced through MPI over all physical nodes */
+    int                 nphysicalnode;       /* Number of physical nodes */
+    int                 ncore_tot;           /* Sum of #cores over all nodes, can be 0 */
+    int                 ncore_min;           /* Min #cores over all nodes */
+    int                 ncore_max;           /* Max #cores over all nodes */
+    int                 nhwthread_tot;       /* Sum of #hwthreads over all nodes */
+    int                 nhwthread_min;       /* Min #hwthreads over all nodes */
+    int                 nhwthread_max;       /* Max #hwthreads over all nodes */
+    int                 ngpu_compatible_tot; /* Sum of #GPUs over all nodes */
+    int                 ngpu_compatible_min; /* Min #GPUs over all nodes */
+    int                 ngpu_compatible_max; /* Max #GPUs over all nodes */
+
+    /* The values below are only used for printing, so here it's not an issue
+     * that strictly speaking SIMD instruction sets can't be uniquely ordered.
+     */
+    enum gmx_cpuid_simd simd_suggest_min;    /* Highest SIMD instruction set supported by all ranks */
+    enum gmx_cpuid_simd simd_suggest_max;    /* Highest SIMD instruction set supported by at least one rank */
+
+    gmx_bool            bIdenticalGPUs;      /* TRUE if all ranks have the same type(s) and order of GPUs */
 } gmx_hw_info_t;
 
 
diff --git a/src/programs/mdrun/mdrun.cpp b/src/programs/mdrun/mdrun.cpp
index 508f6d03b6..c584010d3d 100644
--- a/src/programs/mdrun/mdrun.cpp
+++ b/src/programs/mdrun/mdrun.cpp
@@ -60,7 +60,6 @@
 #include "gromacs/commandline/pargs.h"
 #include "gromacs/fileio/filenm.h"
 #include "gromacs/legacyheaders/checkpoint.h"
-#include "gromacs/legacyheaders/copyrite.h"
 #include "gromacs/legacyheaders/macros.h"
 #include "gromacs/legacyheaders/main.h"
 #include "gromacs/legacyheaders/mdrun.h"
@@ -582,10 +581,6 @@ int gmx_mdrun(int argc, char *argv[])
     {
         gmx_log_open(ftp2fn(efLOG, NFILE, fnm), cr,
                      Flags & MD_APPENDFILES, &fplog);
-        please_cite(fplog, "Hess2008b");
-        please_cite(fplog, "Spoel2005a");
-        please_cite(fplog, "Lindahl2001a");
-        please_cite(fplog, "Berendsen95a");
     }
     else
     {
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 8956d4442b..3f520dd22e 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -53,6 +53,7 @@
 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
 #include "gromacs/legacyheaders/checkpoint.h"
 #include "gromacs/legacyheaders/constr.h"
+#include "gromacs/legacyheaders/copyrite.h"
 #include "gromacs/legacyheaders/disre.h"
 #include "gromacs/legacyheaders/force.h"
 #include "gromacs/legacyheaders/gmx_detect_hardware.h"
@@ -1020,6 +1021,16 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
      * global for this process (MPI rank). */
     hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
 
+    gmx_print_detected_hardware(fplog, cr, hwinfo);
+
+    if (fplog != NULL)
+    {
+        /* Print references after all software/hardware printing */
+        please_cite(fplog, "Hess2008b");
+        please_cite(fplog, "Spoel2005a");
+        please_cite(fplog, "Lindahl2001a");
+        please_cite(fplog, "Berendsen95a");
+    }
 
     snew(state, 1);
     if (SIMMASTER(cr))
@@ -1181,6 +1192,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     if (fplog != NULL)
     {
         pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+        fprintf(fplog, "\n");
     }
 
     /* now make sure the state is initialized and propagated */