Move GPU implementation to new interface

author Mark Abraham <mark.j.abraham@gmail.com>

Fri, 12 Dec 2014 15:56:27 +0000 (16:56 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 30 Apr 2015 08:43:05 +0000 (10:43 +0200)
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 12 Dec 2014 15:56:27 +0000 (16:56 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 30 Apr 2015 08:43:05 +0000 (10:43 +0200)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index 00b643ea05978b41b5b395049751c851c6fdc8d9..28ffa7ab2dc0cf7b35ddf6fcdf93d3155debe774 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -5689,8 +5689,7 @@ void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
      gmx_domdec_t *dd;
      MPI_Comm      mpi_comm_pp_physicalnode;
  
-    if (!(cr->duty & DUTY_PP) ||
-        hw_opt->gpu_opt.ncuda_dev_use == 0)
+    if (!(cr->duty & DUTY_PP) || hw_opt->gpu_opt.n_dev_use == 0)
      {
          /* Only PP nodes (currently) use GPUs.
           * If we don't have GPUs, there are no resources to share.
@@ -5700,7 +5699,7 @@ void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
  
      physicalnode_id_hash = gmx_physicalnode_id_hash();
  
-    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
+    gpu_id = get_cuda_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
  
      dd = cr->dd;
  
diff --git a/src/gromacs/ewald/pme-load-balancing.c b/src/gromacs/ewald/pme-load-balancing.c

index 17fd5f768a0d5fe2153c4203714cd9de1c01647b..758a76a38ebc5ccb4165b85d4378d3a5de83c499 100644 (file)
--- a/src/gromacs/ewald/pme-load-balancing.c
+++ b/src/gromacs/ewald/pme-load-balancing.c
@@ -48,7 +48,7 @@
  #include "gromacs/legacyheaders/sim_util.h"
  #include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/math/vec.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/smalloc.h"
@@ -702,7 +702,7 @@ gmx_bool pme_load_balance(pme_load_balancing_t       pme_lb,
      }
  
      bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
-    nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
+    nbnxn_gpu_pme_loadbal_update_param(nbv, ic);
  
      /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
       * also sharing texture references. To keep the code simple, we don't
diff --git a/src/gromacs/gmxlib/CMakeLists.txt b/src/gromacs/gmxlib/CMakeLists.txt

index 991123e37ed23c81c62a8847223cf94bde7c908d..5aeb346e20a1a469863a11928fce36fbe43d09a2 100644 (file)
--- a/src/gromacs/gmxlib/CMakeLists.txt
+++ b/src/gromacs/gmxlib/CMakeLists.txt
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2009,2010,2012,2014, by the GROMACS development team, led by
+# Copyright (c) 2009,2010,2012,2014,2015, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -42,9 +42,8 @@ file(GLOB GMXLIB_SOURCES *.c *.cpp)
  
  # gpu utils + cuda tools module
  if(GMX_GPU)
-    # The log file output queries Cuda if GPU support is enabled
      add_subdirectory(cuda_tools)
-    add_subdirectory(gpu_utils)
  endif()
+add_subdirectory(gpu_utils)
  
  set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${NONBONDED_SOURCES} PARENT_SCOPE)
diff --git a/src/gromacs/gmxlib/copyrite.cpp b/src/gromacs/gmxlib/copyrite.cpp

index 25225b882e8b06b98d46cd1f8b439b470b9257aa..2748e810dce30b0ce046ee7460ad4e1a897934a8 100644 (file)
--- a/src/gromacs/gmxlib/copyrite.cpp
+++ b/src/gromacs/gmxlib/copyrite.cpp
@@ -646,7 +646,7 @@ const char *Program(void)
  }
  
  
-extern void gmx_print_version_info_gpu(FILE *fp);
+extern void gmx_print_version_info_cuda_gpu(FILE *fp);
  
  static void gmx_print_version_info(FILE *fp)
  {
@@ -745,7 +745,7 @@ static void gmx_print_version_info(FILE *fp)
              BOOST_VERSION / 100 % 1000, BOOST_VERSION % 100,
              bExternalBoost ? " (external)" : " (internal)");
  #ifdef GMX_GPU
-    gmx_print_version_info_gpu(fp);
+    gmx_print_version_info_cuda_gpu(fp);
  #endif
  }
  
diff --git a/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu

index 04fc98075afbe244445545944130c5b30608e813..6e3bc17e87bb6bea8d1acc2cf15998a3aedad060 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
+++ b/src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu
@@ -39,7 +39,7 @@
  
  #include "buildinfo.h"
  
-void gmx_print_version_info_gpu(FILE *fp)
+void gmx_print_version_info_cuda_gpu(FILE *fp)
  {
      int cuda_driver, cuda_runtime;
      fprintf(fp, "CUDA compiler:      %s\n", CUDA_NVCC_COMPILER_INFO);
diff --git a/src/gromacs/gmxlib/cuda_tools/cudautils.cuh b/src/gromacs/gmxlib/cuda_tools/cudautils.cuh

index dbb9f27c1bba4b07f8e3f9fa531a93222e117c51..cdd9dbc052bd4766fdc807a773abd64954b19ccb 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/cudautils.cuh
+++ b/src/gromacs/gmxlib/cuda_tools/cudautils.cuh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -129,8 +129,7 @@ extern "C" {
  #endif
  
  /*! CUDA device information. */
-typedef struct cuda_dev_info cuda_dev_info_t;
-struct cuda_dev_info
+struct gmx_device_info_t
  {
      int                 id;                     /* id of the CUDA device */
      cudaDeviceProp      prop;                   /* CUDA device properties */
diff --git a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu

index cfcc620c19372e0740893214cb81397ab7afdf6f..eeef88fd41e02197b191968da5a0c7c7d422bb1b 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu
+++ b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -32,6 +32,11 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+/*! \internal \file
+ *  \brief Define functions for host-side memory handling when using CUDA devices.
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ */
  
  #include "gmxpre.h"
  
diff --git a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.h b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.h

index d45890802c8bc1ec013649385aee67288af57ca4..cd87a4524ccd8fecfcd0a6dc2989331a64bc724c 100644 (file)
--- a/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.h
+++ b/src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -32,36 +32,35 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+/*! \libinternal \file
+ *  \brief Declare functions for host-side memory handling when using CUDA devices.
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ *  \inlibraryapi
+ */
  
-#ifndef PMALLOC_CUDA_H
-#define PMALLOC_CUDA_H
-
-#include "config.h"
+#ifndef GMX_GMXLIB_CUDA_TOOLS_PMALLOC_CUDA_H
+#define GMX_GMXLIB_CUDA_TOOLS_PMALLOC_CUDA_H
  
  #include <stdlib.h>
  
-#include "gromacs/legacyheaders/types/simple.h"
-
-#ifdef GMX_GPU
-#define FUNC_TERM ;
-#else
-#define FUNC_TERM {}
-#endif
+#include "gromacs/utility/basedefinitions.h"
  
  #ifdef __cplusplus
  extern "C" {
  #endif
  
-/** Allocates nbytes of page-locked memory. */
-void pmalloc(void gmx_unused **h_ptr, size_t gmx_unused nbytes) FUNC_TERM
+/*! \brief Allocates nbytes of page-locked memory. */
+void pmalloc(void **h_ptr, size_t nbytes);
  
-/** Allocates nbytes of page-locked memory with write-combining. */
-void pmalloc_wc(void gmx_unused **h_ptr, size_t gmx_unused nbytes) FUNC_TERM
+/*! \brief Allocates nbytes of page-locked memory with write-combining. */
+void pmalloc_wc(void **h_ptr, size_t nbytes);
  
-/** Frees page locked memory allocated with pmalloc. */
-void pfree(void gmx_unused *h_ptr) FUNC_TERM
+/*! \brief Frees page locked memory allocated with pmalloc. */
+void pfree(void *h_ptr);
  
  #ifdef __cplusplus
  }
  #endif
-#endif  /* PMALLOC_CUDA_H */
+
+#endif
diff --git a/src/gromacs/gmxlib/gmx_detect_hardware.cpp b/src/gromacs/gmxlib/gmx_detect_hardware.cpp

index 490bd87327ca2353d63805fbf186662fad0d2b10..faa5678e8db23465def74fb5517ff3fbc7ce858c 100644 (file)
--- a/src/gromacs/gmxlib/gmx_detect_hardware.cpp
+++ b/src/gromacs/gmxlib/gmx_detect_hardware.cpp
@@ -103,7 +103,7 @@ static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info)
      int      i, ndev;
      char     stmp[STRLEN];
  
-    ndev = gpu_info->ncuda_dev;
+    ndev = gpu_info->n_dev;
  
      sbuf[0] = '\0';
      for (i = 0; i < ndev; i++)
@@ -131,7 +131,7 @@ static void print_gpu_detection_stats(FILE                 *fplog,
          return;
      }
  
-    ngpu = gpu_info->ncuda_dev;
+    ngpu = gpu_info->n_dev;
  
  #if defined GMX_MPI && !defined GMX_THREAD_MPI
      /* We only print the detection on one, of possibly multiple, nodes */
@@ -167,8 +167,8 @@ makeGpuUsageReport(const gmx_gpu_info_t *gpu_info,
                     const gmx_gpu_opt_t  *gpu_opt,
                     size_t                numPpRanks)
  {
-    int ngpu_use  = gpu_opt->ncuda_dev_use;
-    int ngpu_comp = gpu_info->ncuda_dev_compatible;
+    int ngpu_use  = gpu_opt->n_dev_use;
+    int ngpu_comp = gpu_info->n_dev_compatible;
  
      /* Issue a note if GPUs are available but not used */
      if (ngpu_comp > 0 && ngpu_use < 1)
@@ -181,14 +181,14 @@ makeGpuUsageReport(const gmx_gpu_info_t *gpu_info,
      std::string output;
      if (!gpu_opt->bUserSet)
      {
-        // gpu_opt->cuda_dev_compatible is only populated during auto-selection
+        // gpu_opt->dev_compatible is only populated during auto-selection
          std::string gpuIdsString =
-            formatAndJoin(gmx::constArrayRefFromArray(gpu_opt->cuda_dev_compatible,
-                                                      gpu_opt->ncuda_dev_compatible),
+            formatAndJoin(gmx::constArrayRefFromArray(gpu_opt->dev_compatible,
+                                                      gpu_opt->n_dev_compatible),
                            ",", gmx::StringFormatter("%d"));
-        bool bPluralGpus = gpu_opt->ncuda_dev_compatible > 1;
+        bool bPluralGpus = gpu_opt->n_dev_compatible > 1;
          output += gmx::formatString("%d compatible GPU%s %s present, with ID%s %s\n",
-                                    gpu_opt->ncuda_dev_compatible,
+                                    gpu_opt->n_dev_compatible,
                                      bPluralGpus ? "s" : "",
                                      bPluralGpus ? "are" : "is",
                                      bPluralGpus ? "s" : "",
@@ -196,10 +196,10 @@ makeGpuUsageReport(const gmx_gpu_info_t *gpu_info,
      }
  
      {
-        std::vector<int> gpuIdsInUse;
+        std::vector<int>   gpuIdsInUse;
          for (int i = 0; i < ngpu_use; i++)
          {
-            gpuIdsInUse.push_back(get_gpu_device_id(gpu_info, gpu_opt, i));
+            gpuIdsInUse.push_back(get_cuda_gpu_device_id(gpu_info, gpu_opt, i));
          }
          std::string gpuIdsString =
              formatAndJoin(gpuIdsInUse, ",", gmx::StringFormatter("%d"));
@@ -305,7 +305,7 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
      /* NOTE: this print is only for and on one physical node */
      print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
  
-    if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
+    if (hwinfo->gpu_info.n_dev_compatible > 0)
      {
          std::string gpuUseageReport;
          try
@@ -352,14 +352,14 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
          sprintf(th_or_proc, "process");
      }
  
-    if (bUseGPU && hwinfo->gpu_info.ncuda_dev_compatible > 0 &&
+    if (bUseGPU && hwinfo->gpu_info.n_dev_compatible > 0 &&
          !bEmulateGPU)
      {
          int  ngpu_comp, ngpu_use;
          char gpu_comp_plural[2], gpu_use_plural[2];
  
-        ngpu_comp = hwinfo->gpu_info.ncuda_dev_compatible;
-        ngpu_use  = hw_opt->gpu_opt.ncuda_dev_use;
+        ngpu_comp = hwinfo->gpu_info.n_dev_compatible;
+        ngpu_use  = hw_opt->gpu_opt.n_dev_use;
  
          sprintf(gpu_comp_plural, "%s", (ngpu_comp > 1) ? "s" : "");
          sprintf(gpu_use_plural,  "%s", (ngpu_use > 1) ? "s" : "");
@@ -475,7 +475,7 @@ void gmx_check_hw_runconf_consistency(FILE                *fplog,
  int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt)
  {
      int      same_count    = 0;
-    int      ngpu          = gpu_opt->ncuda_dev_use;
+    int      ngpu          = gpu_opt->n_dev_use;
  
      if (gpu_opt->bUserSet)
      {
@@ -485,8 +485,8 @@ int gmx_count_gpu_dev_shared(const gmx_gpu_opt_t *gpu_opt)
          {
              for (j = i + 1; j < ngpu; j++)
              {
-                same_count      += (gpu_opt->cuda_dev_use[i] ==
-                                    gpu_opt->cuda_dev_use[j]);
+                same_count      += (gpu_opt->dev_use[i] ==
+                                    gpu_opt->dev_use[j]);
              }
          }
      }
@@ -509,16 +509,17 @@ static int gmx_count_gpu_dev_unique(const gmx_gpu_info_t *gpu_info,
      assert(gpu_info);
      assert(gpu_opt);
  
-    ngpu        = gpu_info->ncuda_dev;
+    ngpu = gpu_info->n_dev;
+
      uniq_count  = 0;
  
      snew(uniq_ids, ngpu);
  
      /* Each element in uniq_ids will be set to 0 or 1. The n-th element set
       * to 1 indicates that the respective GPU was selected to be used. */
-    for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
+    for (i = 0; i < gpu_opt->n_dev_use; i++)
      {
-        uniq_ids[get_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
+        uniq_ids[get_cuda_gpu_device_id(gpu_info, gpu_opt, i)] = 1;
      }
      /* Count the devices used. */
      for (i = 0; i < ngpu; i++)
@@ -624,7 +625,7 @@ static void gmx_detect_gpus(FILE *fplog, const t_commrec *cr)
      {
          char detection_error[STRLEN] = "", sbuf[STRLEN];
  
-        if (detect_cuda_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
+        if (detect_gpus(&hwinfo_g->gpu_info, detection_error) != 0)
          {
              if (detection_error[0] != '\0')
              {
@@ -643,22 +644,22 @@ static void gmx_detect_gpus(FILE *fplog, const t_commrec *cr)
  
  #ifdef GMX_LIB_MPI
      /* Broadcast the GPU info to the other ranks within this node */
-    MPI_Bcast(&hwinfo_g->gpu_info.ncuda_dev, 1, MPI_INT, 0, physicalnode_comm);
+    MPI_Bcast(&hwinfo_g->gpu_info.n_dev, 1, MPI_INT, 0, physicalnode_comm);
  
-    if (hwinfo_g->gpu_info.ncuda_dev > 0)
+    if (hwinfo_g->gpu_info.n_dev > 0)
      {
-        int cuda_dev_size;
+        int dev_size;
  
-        cuda_dev_size = hwinfo_g->gpu_info.ncuda_dev*sizeof_cuda_dev_info();
+        dev_size = hwinfo_g->gpu_info.n_dev*sizeof_gpu_dev_info();
  
          if (rank_local > 0)
          {
-            hwinfo_g->gpu_info.cuda_dev =
-                (cuda_dev_info_ptr_t)malloc(cuda_dev_size);
+            hwinfo_g->gpu_info.gpu_dev =
+                (struct gmx_device_info_t *)malloc(dev_size);
          }
-        MPI_Bcast(hwinfo_g->gpu_info.cuda_dev, cuda_dev_size, MPI_BYTE,
+        MPI_Bcast(hwinfo_g->gpu_info.gpu_dev, dev_size, MPI_BYTE,
                    0, physicalnode_comm);
-        MPI_Bcast(&hwinfo_g->gpu_info.ncuda_dev_compatible, 1, MPI_INT,
+        MPI_Bcast(&hwinfo_g->gpu_info.n_dev_compatible, 1, MPI_INT,
                    0, physicalnode_comm);
      }
  
@@ -694,9 +695,9 @@ gmx_hw_info_t *gmx_detect_hardware(FILE *fplog, const t_commrec *cr,
          hwinfo_g->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
  
          /* detect GPUs */
-        hwinfo_g->gpu_info.ncuda_dev            = 0;
-        hwinfo_g->gpu_info.cuda_dev             = NULL;
-        hwinfo_g->gpu_info.ncuda_dev_compatible = 0;
+        hwinfo_g->gpu_info.n_dev            = 0;
+        hwinfo_g->gpu_info.n_dev_compatible = 0;
+        hwinfo_g->gpu_info.gpu_dev          = NULL;
  
          /* Run the detection if the binary was compiled with GPU support
           * and we requested detection.
@@ -747,10 +748,10 @@ void gmx_parse_gpu_ids(gmx_gpu_opt_t *gpu_opt)
           * digits corresponding to GPU IDs; the order will indicate
           * the process/tMPI thread - GPU assignment. */
          parse_digits_from_plain_string(env,
-                                       &gpu_opt->ncuda_dev_use,
-                                       &gpu_opt->cuda_dev_use);
+                                       &gpu_opt->n_dev_use,
+                                       &gpu_opt->dev_use);
  
-        if (gpu_opt->ncuda_dev_use == 0)
+        if (gpu_opt->n_dev_use == 0)
          {
              gmx_fatal(FARGS, "Empty GPU ID string encountered.\n%s\n",
                        invalid_gpuid_hint);
@@ -783,21 +784,21 @@ void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
          int *checkres;
          int  res;
  
-        snew(checkres, gpu_opt->ncuda_dev_use);
+        snew(checkres, gpu_opt->n_dev_use);
  
-        res = check_selected_cuda_gpus(checkres, gpu_info, gpu_opt);
+        res = check_selected_gpus(checkres, gpu_info, gpu_opt);
  
          if (!res)
          {
              print_gpu_detection_stats(fplog, gpu_info, cr);
  
              sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
-            for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
+            for (i = 0; i < gpu_opt->n_dev_use; i++)
              {
                  if (checkres[i] != egpuCompatible)
                  {
                      sprintf(stmp, "    GPU #%d: %s\n",
-                            gpu_opt->cuda_dev_use[i],
+                            gpu_opt->dev_use[i],
                              gpu_detect_res_str[checkres[i]]);
                      strcat(sbuf, stmp);
                  }
@@ -814,7 +815,7 @@ void gmx_select_gpu_ids(FILE *fplog, const t_commrec *cr,
      }
  
      /* If the user asked for a GPU, check whether we have a GPU */
-    if (bForceUseGPU && gpu_info->ncuda_dev_compatible == 0)
+    if (bForceUseGPU && gpu_info->n_dev_compatible == 0)
      {
          gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
      }
@@ -829,16 +830,16 @@ static void limit_num_gpus_used(gmx_gpu_opt_t *gpu_opt, int maxNumberToUse)
      GMX_RELEASE_ASSERT(gpu_opt, "Invalid gpu_opt pointer passed");
      GMX_RELEASE_ASSERT(maxNumberToUse >= 1,
                         gmx::formatString("Invalid limit (%d) for the number of GPUs (detected %d compatible GPUs)",
-                                         maxNumberToUse, gpu_opt->ncuda_dev_compatible).c_str());
+                                         maxNumberToUse, gpu_opt->n_dev_compatible).c_str());
  
      /* Don't increase the number of GPUs used beyond (e.g.) the number
         of PP ranks */
-    gpu_opt->ncuda_dev_use = std::min(gpu_opt->ncuda_dev_compatible, maxNumberToUse);
-    snew(gpu_opt->cuda_dev_use, gpu_opt->ncuda_dev_use);
-    for (int i = 0; i != gpu_opt->ncuda_dev_use; ++i)
+    gpu_opt->n_dev_use = std::min(gpu_opt->n_dev_compatible, maxNumberToUse);
+    snew(gpu_opt->dev_use, gpu_opt->n_dev_use);
+    for (int i = 0; i != gpu_opt->n_dev_use; ++i)
      {
          /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
-        gpu_opt->cuda_dev_use[i] = gpu_opt->cuda_dev_compatible[i];
+        gpu_opt->dev_use[i] = gpu_opt->dev_compatible[i];
      }
  }
  
diff --git a/src/gromacs/gmxlib/gpu_utils/CMakeLists.txt b/src/gromacs/gmxlib/gpu_utils/CMakeLists.txt

index dee9b3ee078c3b827e9a4b8f59e6eafcb3256747..2d8565e3aa15e899254ccdc81cd03c842f23f653 100644 (file)
--- a/src/gromacs/gmxlib/gpu_utils/CMakeLists.txt
+++ b/src/gromacs/gmxlib/gpu_utils/CMakeLists.txt
@@ -1,7 +1,7 @@
  #
  # This file is part of the GROMACS molecular simulation package.
  #
-# Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -32,6 +32,9 @@
  # To help us fund GROMACS development, we humbly ask that you cite
  # the research papers on the package. Check out http://www.gromacs.org.
  
-file(GLOB GPU_UTILS_SOURCES *.cu)
+if(GMX_GPU)
+    file(GLOB GPU_UTILS_SOURCES *.cu)
+else()
+    file(GLOB GPU_UTILS_SOURCES *.cpp)
+endif()
  set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${GPU_UTILS_SOURCES} PARENT_SCOPE)
-
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_macros.h b/src/gromacs/gmxlib/gpu_utils/gpu_macros.h

new file mode 100644 (file)

index 0000000..92f1a5c
--- /dev/null
+++ b/src/gromacs/gmxlib/gpu_utils/gpu_macros.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GMXLIB_GPU_UTILS_MACROS_H
+#define GMX_GMXLIB_GPU_UTILS_MACROS_H
+
+#include "config.h"
+
+/* These macros that let us define inlineable null implementations so
+   that non-GPU Gromacs can run with no overhead without conditionality
+   everywhere a GPU function is called. */
+#define REAL_FUNC_QUALIFIER
+#define REAL_FUNC_TERM ;
+#define REAL_FUNC_TERM_WITH_RETURN(arg) ;
+
+#define NULL_FUNC_QUALIFIER static
+#define NULL_FUNC_TERM {}
+#define NULL_FUNC_TERM_WITH_RETURN(arg) { return (arg); }
+
+#if defined GMX_GPU
+
+#define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define GPU_FUNC_TERM REAL_FUNC_TERM
+#define GPU_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#define CUDA_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#define CUDA_FUNC_TERM REAL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+
+#else /* No accelerator support */
+
+#define GPU_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define GPU_FUNC_TERM NULL_FUNC_TERM
+#define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#define CUDA_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#define CUDA_FUNC_TERM NULL_FUNC_TERM
+#define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+
+#endif
+
+#endif
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cpp b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cpp

new file mode 100644 (file)

index 0000000..f2e423e
--- /dev/null
+++ b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cpp
@@ -0,0 +1,54 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Stub functions for non-GPU builds
+ *
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "gpu_utils.h"
+
+/*! \brief Set allocation functions used by the GPU host
+ *
+ * Since GPU support is not configured, there is no host memory to
+ * allocate. */
+void gpu_set_host_malloc_and_free(bool,
+                                  gmx_host_alloc_t **nb_alloc,
+                                  gmx_host_free_t  **nb_free)
+{
+    *nb_alloc = NULL;
+    *nb_free  = NULL;
+}
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu

index de1682da1e9bb33d9cbbe75365d54a792ffd41ee..3d93445f8d7e4762ea4467215ff1498d7a6bfe28 100644 (file)
--- a/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
@@ -32,6 +32,11 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+/*! \file
+ *  \brief Define functions for detection and initialization for CUDA devices.
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ */
  
  #include "gmxpre.h"
  
@@ -44,12 +49,13 @@
  #include <stdlib.h>
  
  #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
+#include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  #include "gromacs/legacyheaders/types/hw_info.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/smalloc.h"
  
-/*! \brief
+/*! \internal \brief
   * Max number of devices supported by CUDA (for consistency checking).
   *
   * In reality it is 16 with CUDA <=v5.0, but let's stay on the safe side.
@@ -232,7 +238,7 @@ static void md_print_warn(FILE       *fplog,
   * \param[in,out] cuda_dev  CUDA device information to enrich with NVML device info
   * \returns                 true if \cuda_dev could be enriched with matching NVML device information.
   */
-static bool addNVMLDeviceId(cuda_dev_info* cuda_dev)
+static bool addNVMLDeviceId(gmx_device_info_t* cuda_dev)
  {
      nvmlReturn_t nvml_stat = NVML_SUCCESS;
      nvmlDevice_t nvml_device_id;
@@ -284,7 +290,7 @@ static bool addNVMLDeviceId(cuda_dev_info* cuda_dev)
   */
  static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unused gpuid, const gmx_gpu_info_t gmx_unused *gpu_info)
  {
-    const cudaDeviceProp *prop                        = &gpu_info->cuda_dev[gpuid].prop;
+    const cudaDeviceProp *prop                        = &gpu_info->gpu_dev[gpuid].prop;
      int                   cuda_version_number         = prop->major * 10 + prop->minor;
      gmx_bool              bGpuCanUseApplicationClocks =
          ((0 == gmx_wcmatch("*Tesla*", prop->name) && cuda_version_number >= 35 ) ||
@@ -320,54 +326,54 @@ static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unus
      {
          return false;
      }
-    if (!addNVMLDeviceId( &(gpu_info->cuda_dev[gpuid])))
+    if (!addNVMLDeviceId( &(gpu_info->gpu_dev[gpuid])))
      {
          return false;
      }
      //get current application clocks setting
      unsigned int app_sm_clock  = 0;
      unsigned int app_mem_clock = 0;
-    nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
+    nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &app_sm_clock );
      if (NVML_ERROR_NOT_SUPPORTED == nvml_stat)
      {
          return false;
      }
      HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
-    nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
+    nvml_stat = nvmlDeviceGetApplicationsClock ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &app_mem_clock );
      HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
      //get max application clocks
      unsigned int max_sm_clock  = 0;
      unsigned int max_mem_clock = 0;
-    nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
+    nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_SM, &max_sm_clock );
      HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
-    nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
+    nvml_stat = nvmlDeviceGetMaxClockInfo ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_CLOCK_MEM, &max_mem_clock );
      HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetMaxClockInfo failed" );
  
-    gpu_info->cuda_dev[gpuid].nvml_is_restricted     = NVML_FEATURE_ENABLED;
-    gpu_info->cuda_dev[gpuid].nvml_ap_clocks_changed = false;
+    gpu_info->gpu_dev[gpuid].nvml_is_restricted     = NVML_FEATURE_ENABLED;
+    gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = false;
  
-    nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->cuda_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->cuda_dev[gpuid].nvml_is_restricted) );
+    nvml_stat = nvmlDeviceGetAPIRestriction ( gpu_info->gpu_dev[gpuid].nvml_device_id, NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS, &(gpu_info->gpu_dev[gpuid].nvml_is_restricted) );
      HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetAPIRestriction failed" );
  
      //TODO: Need to distinguish between different type of GPUs might be necessary in the future, e.g. if max application clocks should not be used
      //      for certain GPUs.
-    if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->cuda_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
+    if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock && gpu_info->gpu_dev[gpuid].nvml_is_restricted == NVML_FEATURE_DISABLED)
      {
          //TODO: Maybe need to think about something more user friendly here.
-        md_print_info( fplog, "Changing GPU clock rates by setting application clocks for %s to (%d,%d)\n", gpu_info->cuda_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
-        nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->cuda_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
+        md_print_info( fplog, "Changing GPU clock rates by setting application clocks for %s to (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, max_mem_clock, max_sm_clock);
+        nvml_stat = nvmlDeviceSetApplicationsClocks ( gpu_info->gpu_dev[gpuid].nvml_device_id, max_mem_clock, max_sm_clock );
          HANDLE_NVML_RET_ERR( nvml_stat, "nvmlDeviceGetApplicationsClock failed" );
-        gpu_info->cuda_dev[gpuid].nvml_ap_clocks_changed = true;
+        gpu_info->gpu_dev[gpuid].nvml_ap_clocks_changed = true;
      }
      else if (nvml_stat == NVML_SUCCESS && app_sm_clock < max_sm_clock)
      {
          //TODO: Maybe need to think about something more user friendly here.
-        md_print_warn( fplog, "Not possible to change GPU clocks to optimal value because of insufficient permissions to set application clocks for %s. Current values are (%d,%d). Max values are (%d,%d)\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clock permissions.\n", gpu_info->cuda_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
+        md_print_warn( fplog, "Not possible to change GPU clocks to optimal value because of insufficient permissions to set application clocks for %s. Current values are (%d,%d). Max values are (%d,%d)\nUse sudo nvidia-smi -acp UNRESTRICTED or contact your admin to change application clock permissions.\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock, max_mem_clock, max_sm_clock);
      }
      else if (nvml_stat == NVML_SUCCESS && app_sm_clock == max_sm_clock)
      {
          //TODO: This should probably be integrated into the GPU Properties table.
-        md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->cuda_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
+        md_print_info( fplog, "Application clocks (GPU clocks) for %s are (%d,%d)\n", gpu_info->gpu_dev[gpuid].prop.name, app_mem_clock, app_sm_clock);
      }
      else
      {
@@ -378,11 +384,11 @@ static gmx_bool init_gpu_application_clocks(FILE gmx_unused *fplog, int gmx_unus
  #endif /*HAVE_NVML*/
  }
  
-/*! \brief Resets application clocks if changed and cleans up NVML for the passed \cuda_dev.
+/*! \brief Resets application clocks if changed and cleans up NVML for the passed \gpu_dev.
   *
- * \param[in] cuda_dev  CUDA device information
+ * \param[in] gpu_dev  CUDA device information
   */
-static gmx_bool reset_gpu_application_clocks(const cuda_dev_info gmx_unused * cuda_dev)
+static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused * cuda_dev)
  {
  #ifndef HAVE_NVML
      GMX_UNUSED_VALUE(cuda_dev);
@@ -402,20 +408,6 @@ static gmx_bool reset_gpu_application_clocks(const cuda_dev_info gmx_unused * cu
  #endif /*HAVE_NVML*/
  }
  
-
-/*! \brief Initializes the GPU with the given index.
- *
- * The varible \mygpu is the index of the GPU to initialize in the
- * gpu_info.cuda_dev array.
- *
- * \param[out] fplog        log file to write to
- * \param[in]  mygpu        index of the GPU to initialize
- * \param[out] result_str   the message related to the error that occurred
- *                          during the initialization (if there was any).
- * \param[in] gpu_info      GPU info of all detected devices in the system.
- * \param[in] gpu_opt       options for using the GPUs in gpu_info
- * \returns                 true if no error occurs during initialization.
- */
  gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
                    const gmx_gpu_info_t *gpu_info,
                    const gmx_gpu_opt_t *gpu_opt)
@@ -427,22 +419,22 @@ gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
      assert(gpu_info);
      assert(result_str);
  
-    if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
+    if (mygpu < 0 || mygpu >= gpu_opt->n_dev_use)
      {
          sprintf(sbuf, "Trying to initialize an inexistent GPU: "
                  "there are %d %s-selected GPU(s), but #%d was requested.",
-                gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
+                gpu_opt->n_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
          gmx_incons(sbuf);
      }
  
-    gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
+    gpuid = gpu_info->gpu_dev[gpu_opt->dev_use[mygpu]].id;
  
      stat = cudaSetDevice(gpuid);
      strncpy(result_str, cudaGetErrorString(stat), STRLEN);
  
      if (debug)
      {
-        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
+        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->gpu_dev[gpuid].prop.name);
      }
  
      //Ignoring return value as NVML errors should be treated not critical.
@@ -453,19 +445,7 @@ gmx_bool init_gpu(FILE gmx_unused *fplog, int mygpu, char *result_str,
      return (stat == cudaSuccess);
  }
  
-/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
- *
- * The context is explicitly destroyed and therefore all data uploaded to the GPU
- * is lost. This should only be called when none of this data is required anymore.
- *
- * \param[in]  mygpu        index of the GPU clean up for
- * \param[out] result_str   the message related to the error that occurred
- *                          during the initialization (if there was any).
- * \param[in] gpu_info      GPU info of all detected devices in the system.
- * \param[in] gpu_opt       options for using the GPUs in gpu_info
- * \returns                 true if no error occurs during the freeing.
- */
-gmx_bool free_gpu(
+gmx_bool free_cuda_gpu(
          int gmx_unused mygpu, char *result_str,
          const gmx_gpu_info_t gmx_unused *gpu_info,
          const gmx_gpu_opt_t gmx_unused *gpu_opt
@@ -485,10 +465,10 @@ gmx_bool free_gpu(
          fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
      }
  
-    gpuid = gpu_opt ? gpu_opt->cuda_dev_use[mygpu] : -1;
+    gpuid = gpu_opt ? gpu_opt->dev_use[mygpu] : -1;
      if (gpuid != -1)
      {
-        reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->cuda_dev[gpuid]) );
+        reset_gpu_application_clocks_status = reset_gpu_application_clocks( &(gpu_info->gpu_dev[gpuid]) );
      }
  
      stat = cudaDeviceReset();
@@ -567,30 +547,17 @@ static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
  }
  
  
-/*! \brief Detect all NVIDIA GPUs in the system.
- *
- *  Will detect every NVIDIA GPU supported by the device driver in use. Also
- *  check for the compatibility of each and fill the gpu_info->cuda_dev array
- *  with the required information on each the device: ID, device properties,
- *  status.
- *
- *  \param[in] gpu_info    pointer to structure holding GPU information.
- *  \param[out] err_str    The error message of any CUDA API error that caused
- *                         the detection to fail (if there was any). The memory
- *                         the pointer points to should be managed externally.
- *  \returns               non-zero if the detection encountered a failure, zero otherwise.
- */
-int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
+int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  {
-    int              i, ndev, checkres, retval;
-    cudaError_t      stat;
-    cudaDeviceProp   prop;
-    cuda_dev_info_t *devs;
+    int                i, ndev, checkres, retval;
+    cudaError_t        stat;
+    cudaDeviceProp     prop;
+    gmx_device_info_t *devs;
  
      assert(gpu_info);
      assert(err_str);
  
-    gpu_info->ncuda_dev_compatible = 0;
+    gpu_info->n_dev_compatible = 0;
  
      ndev    = 0;
      devs    = NULL;
@@ -621,31 +588,18 @@ int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  
              if (checkres == egpuCompatible)
              {
-                gpu_info->ncuda_dev_compatible++;
+                gpu_info->n_dev_compatible++;
              }
          }
          retval = 0;
      }
  
-    gpu_info->ncuda_dev = ndev;
-    gpu_info->cuda_dev  = devs;
+    gpu_info->n_dev   = ndev;
+    gpu_info->gpu_dev = devs;
  
      return retval;
  }
  
-/*! \brief Select the GPUs compatible with the native GROMACS acceleration.
- *
- * This function selects the compatible gpus and initializes
- * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
- *
- * Given the list of GPUs available in the system check each device in
- * gpu_info->cuda_dev and place the indices of the compatible GPUs into
- * cuda_dev_use with this marking the respective GPUs as "available for use."
- * Note that \detect_cuda_gpus must have been called before.
- *
- * \param[in]     gpu_info    pointer to structure holding GPU information
- * \param[in,out] gpu_opt     pointer to structure holding GPU options
- */
  void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
                            gmx_gpu_opt_t        *gpu_opt)
  {
@@ -653,69 +607,57 @@ void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
      int *compat;
  
      assert(gpu_info);
-    /* cuda_dev/ncuda_dev have to be either NULL/0 or not (NULL/0) */
-    assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
+    /* gpu_dev/n_dev have to be either NULL/0 or not (NULL/0) */
+    assert((gpu_info->n_dev != 0 ? 0 : 1) ^ (gpu_info->gpu_dev == NULL ? 0 : 1));
  
-    snew(compat, gpu_info->ncuda_dev);
+    snew(compat, gpu_info->n_dev);
      ncompat = 0;
-    for (i = 0; i < gpu_info->ncuda_dev; i++)
+    for (i = 0; i < gpu_info->n_dev; i++)
      {
-        if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
+        if (is_compatible_gpu(gpu_info->gpu_dev[i].stat))
          {
              ncompat++;
              compat[ncompat - 1] = i;
          }
      }
  
-    gpu_opt->ncuda_dev_compatible = ncompat;
-    snew(gpu_opt->cuda_dev_compatible, ncompat);
-    memcpy(gpu_opt->cuda_dev_compatible, compat, ncompat*sizeof(*compat));
+    gpu_opt->n_dev_compatible = ncompat;
+    snew(gpu_opt->dev_compatible, ncompat);
+    memcpy(gpu_opt->dev_compatible, compat, ncompat*sizeof(*compat));
      sfree(compat);
  }
  
-/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
- *
- * Given the a list of gpu->ncuda_dev_use GPU device IDs stored in
- * gpu_opt->cuda_dev_use check the existence and compatibility
- * of the respective GPUs. Also provide the caller with an array containing
- * the result of checks in \checkres.
- *
- * \param[out]  checkres    check result for each ID passed in \requested_devs
- * \param[in]   gpu_info    pointer to structure holding GPU information
- * \param[out]  gpu_opt     pointer to structure holding GPU options
- * \returns                 TRUE if every the requested GPUs are compatible
- */
-gmx_bool check_selected_cuda_gpus(int                  *checkres,
-                                  const gmx_gpu_info_t *gpu_info,
-                                  gmx_gpu_opt_t        *gpu_opt)
+gmx_bool check_selected_gpus(int                  *checkres,
+                             const gmx_gpu_info_t *gpu_info,
+                             gmx_gpu_opt_t        *gpu_opt)
  {
      int  i, id;
      bool bAllOk;
  
      assert(checkres);
      assert(gpu_info);
-    assert(gpu_opt->ncuda_dev_use >= 0);
+    assert(gpu_opt->n_dev_use >= 0);
  
-    if (gpu_opt->ncuda_dev_use == 0)
+    if (gpu_opt->n_dev_use == 0)
      {
          return TRUE;
      }
  
-    assert(gpu_opt->cuda_dev_use);
+    assert(gpu_opt->dev_use);
  
      /* we will assume that all GPUs requested are valid IDs,
         otherwise we'll bail anyways */
  
      bAllOk = true;
-    for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
+    for (i = 0; i < gpu_opt->n_dev_use; i++)
      {
-        id = gpu_opt->cuda_dev_use[i];
+        id = gpu_opt->dev_use[i];
  
-        /* devices are stored in increasing order of IDs in cuda_dev */
-        gpu_opt->cuda_dev_use[i] = id;
+        /* devices are stored in increasing order of IDs in gpu_dev */
+        gpu_opt->dev_use[i] = id;
  
-        checkres[i] = (id >= gpu_info->ncuda_dev) ?
-            egpuNonexistent : gpu_info->cuda_dev[id].stat;
+        checkres[i] = (id >= gpu_info->n_dev) ?
+            egpuNonexistent : gpu_info->gpu_dev[id].stat;
  
          bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
      }
@@ -723,10 +665,6 @@ gmx_bool check_selected_cuda_gpus(int                  *checkres,
      return bAllOk;
  }
  
-/*! \brief Frees the cuda_dev and cuda_dev_use array fields of \gpu_info.
- *
- * \param[in]    gpu_info    pointer to structure holding GPU information
- */
  void free_gpu_info(const gmx_gpu_info_t *gpu_info)
  {
      if (gpu_info == NULL)
@@ -734,32 +672,22 @@ void free_gpu_info(const gmx_gpu_info_t *gpu_info)
          return;
      }
  
-    sfree(gpu_info->cuda_dev);
+    sfree(gpu_info->gpu_dev);
  }
  
-/*! \brief Formats and returns a device information string for a given GPU.
- *
- * Given an index *directly* into the array of available GPUs (cuda_dev)
- * returns a formatted info string for the respective GPU which includes
- * ID, name, compute capability, and detection status.
- *
- * \param[out]  s           pointer to output string (has to be allocated externally)
- * \param[in]   gpu_info    pointer to structure holding GPU information
- * \param[in]   index       an index *directly* into the array of available GPUs
- */
  void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
  {
      assert(s);
      assert(gpu_info);
  
-    if (index < 0 && index >= gpu_info->ncuda_dev)
+    if (index < 0 && index >= gpu_info->n_dev)
      {
          return;
      }
  
-    cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
+    gmx_device_info_t *dinfo = &gpu_info->gpu_dev[index];
  
-    bool             bGpuExists =
+    bool               bGpuExists =
          dinfo->stat == egpuCompatible ||
          dinfo->stat == egpuIncompatible;
  
@@ -779,36 +707,18 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int ind
      }
  }
  
-/*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
- *
- * Getter function which, given an index into the array of GPUs in use
- * (cuda_dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
- * respective CUDA GPU.
- *
- * \param[in]    gpu_info   pointer to structure holding GPU information
- * \param[in]    gpu_opt    pointer to structure holding GPU options
- * \param[in]    idx        index into the array of used GPUs
- * \returns                 device ID of the requested GPU
- */
-int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
-                      const gmx_gpu_opt_t  *gpu_opt,
-                      int                   idx)
+int get_cuda_gpu_device_id(const gmx_gpu_info_t *gpu_info,
+                           const gmx_gpu_opt_t  *gpu_opt,
+                           int                   idx)
  {
      assert(gpu_info);
      assert(gpu_opt);
-    assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
+    assert(idx >= 0 && idx < gpu_opt->n_dev_use);
  
-    return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
+    return gpu_info->gpu_dev[gpu_opt->dev_use[idx]].id;
  }
  
-/*! \brief Returns the device ID of the GPU currently in use.
- *
- * The GPU used is the one that is active at the time of the call in the active context.
- *
- * \param[in]    gpu_info   pointer to structure holding GPU information
- * \returns                 device ID of the GPU in use at the time of the call
- */
-int get_current_gpu_device_id(void)
+int get_current_cuda_gpu_device_id(void)
  {
      int gpuid;
      CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
@@ -816,13 +726,23 @@ int get_current_gpu_device_id(void)
      return gpuid;
  }
  
-/*! \brief Returns the size of the cuda_dev_info struct.
- *
- * The size of cuda_dev_info can be used for allocation and communication.
- *
- * \returns                 size in bytes of cuda_dev_info
- */
-size_t sizeof_cuda_dev_info(void)
+size_t sizeof_gpu_dev_info(void)
+{
+    return sizeof(gmx_device_info_t);
+}
+
+void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
+                                  gmx_host_alloc_t **nb_alloc,
+                                  gmx_host_free_t  **nb_free)
  {
-    return sizeof(cuda_dev_info);
+    if (bUseGpuKernels)
+    {
+        *nb_alloc = &pmalloc;
+        *nb_free  = &pfree;
+    }
+    else
+    {
+        *nb_alloc = NULL;
+        *nb_free  = NULL;
+    }
  }
diff --git a/src/gromacs/gmxlib/gpu_utils/gpu_utils.h b/src/gromacs/gmxlib/gpu_utils/gpu_utils.h

index 46468b0180d7a5032ce57e910b2a8b8136ec452a..f11d67a4fb83d43ecac222e1837be484a01ba6d8 100644 (file)
--- a/src/gromacs/gmxlib/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gmxlib/gpu_utils/gpu_utils.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2010, The GROMACS development team.
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -34,77 +34,193 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+/*! \libinternal \file
+ *  \brief Declare functions for detection and initialization for GPU devices.
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *
+ *  \inlibraryapi
+ */
  
-#ifndef _GPU_UTILS_H_
-#define _GPU_UTILS_H_
-
-#include "config.h"
+#ifndef GMX_GMXLIB_GPU_UTILS_GPU_UTILS_H
+#define GMX_GMXLIB_GPU_UTILS_GPU_UTILS_H
  
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
  #include "gromacs/legacyheaders/types/hw_info.h"
  #include "gromacs/legacyheaders/types/simple.h"
  
-#ifdef GMX_GPU
-#define FUNC_TERM_INT ;
-#define FUNC_TERM_SIZE_T ;
-#define FUNC_TERM_VOID ;
-#define FUNC_QUALIFIER
-#else
-#define FUNC_TERM_INT {return -1; }
-#define FUNC_TERM_SIZE_T {return 0; }
-#define FUNC_TERM_VOID {}
-#define FUNC_QUALIFIER static
-#endif
-
  #ifdef __cplusplus
  extern "C" {
  #endif
  
-FUNC_QUALIFIER
-int detect_cuda_gpus(gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) FUNC_TERM_INT
+struct gmx_gpu_info_t;
  
-FUNC_QUALIFIER
-void pick_compatible_gpus(const gmx_gpu_info_t gmx_unused *gpu_info,
-                          gmx_gpu_opt_t gmx_unused        *gpu_opt) FUNC_TERM_VOID
+/*! \brief Detect all GPUs in the system.
+ *
+ *  Will detect every GPU supported by the device driver in use. Also
+ *  check for the compatibility of each and fill the gpu_info->gpu_dev array
+ *  with the required information on each the device: ID, device properties,
+ *  status.
+ *
+ *  \param[in] gpu_info    pointer to structure holding GPU information.
+ *  \param[out] err_str    The error message of any GPU API error that caused
+ *                         the detection to fail (if there was any). The memory
+ *                         the pointer points to should be managed externally.
+ *  \returns               non-zero if the detection encountered a failure, zero otherwise.
+ */
+GPU_FUNC_QUALIFIER
+int detect_gpus(struct gmx_gpu_info_t gmx_unused *gpu_info, char gmx_unused *err_str) GPU_FUNC_TERM_WITH_RETURN(-1)
  
-FUNC_QUALIFIER
-gmx_bool check_selected_cuda_gpus(int gmx_unused                  *checkres,
-                                  const gmx_gpu_info_t gmx_unused *gpu_info,
-                                  gmx_gpu_opt_t gmx_unused        *gpu_opt) FUNC_TERM_INT
+/*! \brief Select the compatible GPUs
+ *
+ * This function selects the compatible gpus and initializes
+ * gpu_info->dev_use and gpu_info->n_dev_use.
+ *
+ * Given the list of GPUs available in the system check each device in
+ * gpu_info->gpu_dev and place the indices of the compatible GPUs into
+ * dev_use with this marking the respective GPUs as "available for use."
+ * Note that \p detect_gpus must have been called before.
+ *
+ * \param[in]     gpu_info    pointer to structure holding GPU information
+ * \param[in,out] gpu_opt     pointer to structure holding GPU options
+ */
+GPU_FUNC_QUALIFIER
+void pick_compatible_gpus(const struct gmx_gpu_info_t gmx_unused *gpu_info,
+                          gmx_gpu_opt_t gmx_unused               *gpu_opt) GPU_FUNC_TERM
  
-FUNC_QUALIFIER
-void free_gpu_info(const gmx_gpu_info_t gmx_unused *gpu_info) FUNC_TERM_VOID
+/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
+ *
+ * Given the a list of gpu_opt->n_dev_use GPU device IDs stored in
+ * gpu_opt->dev_use check the existence and compatibility
+ * of the respective GPUs. Also provide the caller with an array containing
+ * the result of checks in \p checkres.
+ *
+ * \param[out]  checkres    check result for each ID passed in requested_devs
+ * \param[in]   gpu_info    pointer to structure holding GPU information
+ * \param[out]  gpu_opt     pointer to structure holding GPU options
+ * \returns                 TRUE if every the requested GPUs are compatible
+ */
+GPU_FUNC_QUALIFIER
+gmx_bool check_selected_gpus(int gmx_unused                         *checkres,
+                             const struct gmx_gpu_info_t gmx_unused *gpu_info,
+                             gmx_gpu_opt_t gmx_unused               *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
  
-FUNC_QUALIFIER
-gmx_bool init_gpu(FILE gmx_unused *fplog, int gmx_unused mygpu, char gmx_unused *result_str,
-                  const gmx_gpu_info_t gmx_unused *gpu_info,
-                  const gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_INT
+/*! \brief Frees the gpu_dev and dev_use array fields of \p gpu_info.
+ *
+ * \param[in]    gpu_info    pointer to structure holding GPU information
+ */
+GPU_FUNC_QUALIFIER
+void free_gpu_info(const struct gmx_gpu_info_t gmx_unused *gpu_info) GPU_FUNC_TERM
  
-FUNC_QUALIFIER
-gmx_bool free_gpu(int gmx_unused mygpu, char gmx_unused *result_str,
-                  const gmx_gpu_info_t gmx_unused *gpu_info,
-                  const gmx_gpu_opt_t gmx_unused *gpu_opt) FUNC_TERM_INT
+/*! \brief Initializes the GPU with the given index.
+ *
+ * The varible \p mygpu is the index of the GPU to initialize in the
+ * gpu_info.gpu_dev array.
+ *
+ * \param[out] fplog        log file to write to
+ * \param[in]  mygpu        index of the GPU to initialize
+ * \param[out] result_str   the message related to the error that occurred
+ *                          during the initialization (if there was any).
+ * \param[in] gpu_info      GPU info of all detected devices in the system.
+ * \param[in] gpu_opt       options for using the GPUs in gpu_info
+ * \returns                 true if no error occurs during initialization.
+ */
+GPU_FUNC_QUALIFIER
+gmx_bool init_gpu(FILE gmx_unused                        *fplog,
+                  int gmx_unused                          mygpu,
+                  char gmx_unused                        *result_str,
+                  const struct gmx_gpu_info_t gmx_unused *gpu_info,
+                  const gmx_gpu_opt_t gmx_unused         *gpu_opt) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
+ *
+ * The context is explicitly destroyed and therefore all data uploaded to the GPU
+ * is lost. This should only be called when none of this data is required anymore.
+ *
+ * \param[in]  mygpu        index of the GPU clean up for
+ * \param[out] result_str   the message related to the error that occurred
+ *                          during the initialization (if there was any).
+ * \param[in] gpu_info      GPU info of all detected devices in the system.
+ * \param[in] gpu_opt       options for using the GPUs in gpu_info
+ * \returns                 true if no error occurs during the freeing.
+ */
+CUDA_FUNC_QUALIFIER
+gmx_bool free_cuda_gpu(int gmx_unused                   mygpu,
+                       char gmx_unused                 *result_str,
+                       const gmx_gpu_info_t gmx_unused *gpu_info,
+                       const gmx_gpu_opt_t gmx_unused  *gpu_opt) CUDA_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Returns the device ID of the CUDA GPU currently in use.
+ *
+ * The GPU used is the one that is active at the time of the call in the active context.
+ *
+ * \returns                 device ID of the GPU in use at the time of the call
+ */
+CUDA_FUNC_QUALIFIER
+int get_current_cuda_gpu_device_id(void) CUDA_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Returns the device ID of the CUDA GPU with a given index into the array of used GPUs.
+ *
+ * Getter function which, given an index into the array of GPUs in use
+ * (dev_use) -- typically a tMPI/MPI rank --, returns the device ID of the
+ * respective CUDA GPU.
+ *
+ * \param[in]    gpu_info   pointer to structure holding GPU information
+ * \param[in]    gpu_opt    pointer to structure holding GPU options
+ * \param[in]    index      index into the array of used GPUs
+ * \returns                 device ID of the requested GPU
+ */
+CUDA_FUNC_QUALIFIER
+int get_cuda_gpu_device_id(const struct gmx_gpu_info_t gmx_unused *gpu_info,
+                           const gmx_gpu_opt_t gmx_unused         *gpu_opt,
+                           int gmx_unused                          index) CUDA_FUNC_TERM_WITH_RETURN(-1)
+
+/*! \brief Formats and returns a device information string for a given GPU.
+ *
+ * Given an index *directly* into the array of available GPUs (gpu_dev)
+ * returns a formatted info string for the respective GPU which includes
+ * ID, name, compute capability, and detection status.
+ *
+ * \param[out]  s           pointer to output string (has to be allocated externally)
+ * \param[in]   gpu_info    pointer to structure holding GPU information
+ * \param[in]   index       an index *directly* into the array of available GPUs
+ */
+GPU_FUNC_QUALIFIER
+void get_gpu_device_info_string(char gmx_unused                        *s,
+                                const struct gmx_gpu_info_t gmx_unused *gpu_info,
+                                int gmx_unused                          index) GPU_FUNC_TERM
  
-/*! \brief Returns the device ID of the GPU currently in use.*/
-FUNC_QUALIFIER
-int get_current_gpu_device_id(void) FUNC_TERM_INT
+/*! \brief Returns the size of the gpu_dev_info struct.
+ *
+ * The size of gpu_dev_info can be used for allocation and communication.
+ *
+ * \returns                 size in bytes of gpu_dev_info
+ */
+GPU_FUNC_QUALIFIER
+size_t sizeof_gpu_dev_info(void) GPU_FUNC_TERM_WITH_RETURN(0)
  
-FUNC_QUALIFIER
-int get_gpu_device_id(const gmx_gpu_info_t gmx_unused *gpu_info,
-                      const gmx_gpu_opt_t gmx_unused  *gpu_opt,
-                      int gmx_unused                   index) FUNC_TERM_INT
+/*! \brief Returns a pointer *ptr to page-locked memory of size nbytes.
+ *
+ * The allocated memory is suitable to be used for data transfers between host
+ * and GPU.
+ * Error handling should be done within this function.
+ */
+typedef void gmx_host_alloc_t (void **ptr, size_t nbytes);
  
-FUNC_QUALIFIER
-void get_gpu_device_info_string(char gmx_unused *s, const gmx_gpu_info_t gmx_unused *gpu_info, int gmx_unused index) FUNC_TERM_VOID
+/*! \brief Frees page-locked memory pointed to by *ptr.
+ *
+ * NULL should not be passed to this function.
+ */
+typedef void gmx_host_free_t (void *ptr);
  
-FUNC_QUALIFIER
-size_t sizeof_cuda_dev_info(void) FUNC_TERM_SIZE_T
+/*! \brief Set page-locked memory allocation functions used by the GPU host. */
+void gpu_set_host_malloc_and_free(bool               bUseGpuKernels,
+                                  gmx_host_alloc_t **nb_alloc,
+                                  gmx_host_free_t  **nb_free);
  
  #ifdef __cplusplus
  }
  #endif
  
-#undef FUNC_TERM_INT
-#undef FUNC_TERM_VOID
-#undef FUNC_QUALIFIER
-
-#endif /* _GPU_UTILS_H_ */
+#endif
diff --git a/src/gromacs/gmxlib/nrnb.c b/src/gromacs/gmxlib/nrnb.c

index 2a0f677a2d8bc2dddf95b1636acdfe74a39b8b36..5007d2aab3e5e4e8976d69141233369977528591 100644 (file)
--- a/src/gromacs/gmxlib/nrnb.c
+++ b/src/gromacs/gmxlib/nrnb.c
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -88,9 +88,9 @@ static const t_nrnb_data nbdata[eNRNB] = {
  
      { "Pair Search distance check",      9 }, /* nbnxn pair dist. check */
      /* nbnxn kernel flops are based on inner-loops without exclusion checks.
-     * Plain Coulomb runs through the RF kernels, except with CUDA.
+     * Plain Coulomb runs through the RF kernels, except with GPUs.
       * invsqrt is counted as 6 flops: 1 for _mm_rsqt_ps + 5 for iteration.
-     * The flops are equal for plain-C, x86 SIMD and CUDA, except for:
+     * The flops are equal for plain-C, x86 SIMD and GPUs, except for:
       * - plain-C kernel uses one flop more for Coulomb-only (F) than listed
       * - x86 SIMD LJ geom-comb.rule kernels (fastest) use 2 more flops
       * - x86 SIMD LJ LB-comb.rule kernels (fast) use 3 (8 for F+E) more flops
diff --git a/src/gromacs/legacyheaders/force.h b/src/gromacs/legacyheaders/force.h

index ca19eae719da3d070aa2d853434259cd0c64ae18..4014b5740e689a0a21b2729f0f2905c1e9d96d77 100644 (file)
--- a/src/gromacs/legacyheaders/force.h
+++ b/src/gromacs/legacyheaders/force.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -279,10 +279,10 @@ extern void do_force_lowlevel(t_forcerec   *fr,
                                float        *cycles_pme);
  /* Call all the force routines */
  
-void free_gpu_resources(const t_forcerec     *fr,
-                        const t_commrec      *cr,
-                        const gmx_gpu_info_t *gpu_info,
-                        const gmx_gpu_opt_t  *gpu_opt);
+void free_gpu_resources(const t_forcerec            *fr,
+                        const t_commrec             *cr,
+                        const struct gmx_gpu_info_t *gpu_info,
+                        const gmx_gpu_opt_t         *gpu_opt);
  
  #ifdef __cplusplus
  }
diff --git a/src/gromacs/legacyheaders/types/hw_info.h b/src/gromacs/legacyheaders/types/hw_info.h

index 0a8bb4913ff525542e735e4bf7884af22091bc28..842dfbe64be21a00effd89260b41183673756e93 100644 (file)
--- a/src/gromacs/legacyheaders/types/hw_info.h
+++ b/src/gromacs/legacyheaders/types/hw_info.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -37,7 +37,6 @@
  #define HWINFO_H
  
  #include "gromacs/legacyheaders/gmx_cpuid.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
  #include "gromacs/legacyheaders/types/simple.h"
  
  #ifdef __cplusplus
@@ -47,6 +46,8 @@ extern "C" {
  } /* fixes auto-indentation problems */
  #endif
  
+struct gmx_device_info_t;
+
  /* Possible results of the GPU detection/check.
   *
   * The egpuInsane value means that during the sanity checks an error
@@ -63,15 +64,15 @@ static const char * const gpu_detect_res_str[] =
      "compatible", "inexistent", "incompatible", "insane"
  };
  
-/* GPU device information -- for now with only CUDA devices.
+/* GPU device information -- for now with only CUDA devices
   * The gmx_hardware_detect module initializes it. */
-typedef struct
+struct gmx_gpu_info_t
  {
-    gmx_bool             bDetectGPUs;          /* Did we try to detect GPUs? */
-    int                  ncuda_dev;            /* total number of devices detected */
-    cuda_dev_info_ptr_t  cuda_dev;             /* devices detected in the system (per node) */
-    int                  ncuda_dev_compatible; /* number of compatible GPUs */
-} gmx_gpu_info_t;
+    gmx_bool                  bDetectGPUs;      /* Did we try to detect GPUs? */
+    int                       n_dev;            /* total number of GPU devices detected */
+    struct gmx_device_info_t *gpu_dev;          /* GPU devices detected in the system (per node) */
+    int                       n_dev_compatible; /* number of compatible GPUs */
+};
  
  /* Hardware information structure with CPU and GPU information.
   * It is initialized by gmx_detect_hardware().
@@ -79,9 +80,9 @@ typedef struct
   *       (i.e. must be able to be shared among all threads) */
  typedef struct
  {
-    gmx_gpu_info_t  gpu_info;            /* Information about GPUs detected in the system */
+    struct gmx_gpu_info_t gpu_info;      /* Information about GPUs detected in the system */
  
-    gmx_cpuid_t     cpuid_info;          /* CPUID information about CPU detected;
+    gmx_cpuid_t           cpuid_info;    /* CPUID information about CPU detected;
                                              NOTE: this will only detect the CPU thread 0 of the
                                              current process runs on. */
      int             nthreads_hw_avail;   /* Number of hardware threads available; this number
@@ -98,13 +99,13 @@ enum {
  /* GPU device selection information -- for now with only CUDA devices */
  typedef struct
  {
-    char     *gpu_id;               /* GPU id's to use, each specified as chars */
-    gmx_bool  bUserSet;             /* true if the GPUs in cuda_dev_use are manually provided by the user */
+    char     *gpu_id;           /* GPU id's to use, each specified as chars */
+    gmx_bool  bUserSet;         /* true if the GPUs in dev_use are manually provided by the user */
  
-    int       ncuda_dev_compatible; /* number of compatible GPU devices that could be used */
-    int      *cuda_dev_compatible;  /* array of compatible GPU device IDs, from which automatic selection occurs */
-    int       ncuda_dev_use;        /* number of GPU devices selected to be used, either by the user or automatically */
-    int      *cuda_dev_use;         /* array mapping from PP rank index to GPU device ID; GPU IDs can be listed multiple times when ranks share them */
+    int       n_dev_compatible; /* number of compatible GPU devices that could be used */
+    int      *dev_compatible;   /* array of compatible GPU device IDs, from which automatic selection occurs */
+    int       n_dev_use;        /* number of GPU devices selected to be used, either by the user or automatically */
+    int      *dev_use;          /* array mapping from PP rank index to GPU device ID; GPU IDs can be listed multiple times when ranks share them */
  } gmx_gpu_opt_t;
  
  /* Threading and GPU options, can be set automatically or by the user */
diff --git a/src/gromacs/math/utilities.h b/src/gromacs/math/utilities.h

index 9cc58649d9beffbd7b0d21ce6ff8563ed27905f8..467430d9eaaea76ff34057cb4c2823c08a0c0bc3 100644 (file)
--- a/src/gromacs/math/utilities.h
+++ b/src/gromacs/math/utilities.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -67,7 +67,7 @@ extern "C" {
  #define M_1_PI      0.31830988618379067154
  #endif
  
-#ifndef M_FLOAT_1_SQRTPI /* used in CUDA kernels */
+#ifndef M_FLOAT_1_SQRTPI /* used in GPU kernels */
  /* 1.0 / sqrt(M_PI) */
  #define M_FLOAT_1_SQRTPI 0.564189583547756f
  #endif
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 193894b7cff76954b9a4d4fbce9e2eebcb16d708..3324370c237520f693319ba13e1aa7b6accaf4d9 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -40,11 +40,11 @@
  
  #include <assert.h>
  #include <math.h>
+#include <stdlib.h>
  #include <string.h>
  
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/ewald/ewald.h"
-#include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h"
  #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  #include "gromacs/legacyheaders/copyrite.h"
  #include "gromacs/legacyheaders/force.h"
@@ -63,7 +63,6 @@
  #include "gromacs/legacyheaders/txtdump.h"
  #include "gromacs/legacyheaders/typedefs.h"
  #include "gromacs/legacyheaders/types/commrec.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
  #include "gromacs/listed-forces/manage-threading.h"
  #include "gromacs/math/calculate-ewald-splitting-coefficient.h"
  #include "gromacs/math/units.h"
@@ -73,9 +72,9 @@
  #include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdlib/nbnxn_atomdata.h"
  #include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/mdlib/nbnxn_search.h"
  #include "gromacs/mdlib/nbnxn_simd.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
  #include "gromacs/pbcutil/ishift.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/simd/simd.h"
@@ -83,6 +82,8 @@
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/smalloc.h"
  
+#include "nbnxn_gpu_jit_support.h"
+
  t_forcerec *mk_forcerec(void)
  {
      t_forcerec *fr;
@@ -1678,7 +1679,7 @@ const char *lookup_nbnxn_kernel_name(int kernel_type)
              returnvalue = "not available";
  #endif /* GMX_NBNXN_SIMD */
              break;
-        case nbnxnk8x8x8_CUDA: returnvalue   = "CUDA"; break;
+        case nbnxnk8x8x8_GPU: returnvalue    = "GPU"; break;
          case nbnxnk8x8x8_PlainC: returnvalue = "plain C"; break;
  
          case nbnxnkNR:
@@ -1716,7 +1717,7 @@ static void pick_nbnxn_kernel(FILE                *fp,
      }
      else if (bUseGPU)
      {
-        *kernel_type = nbnxnk8x8x8_CUDA;
+        *kernel_type = nbnxnk8x8x8_GPU;
      }
  
      if (*kernel_type == nbnxnkNotSet)
@@ -1780,12 +1781,11 @@ static void pick_nbnxn_resources(FILE                *fp,
       * Note that you should freezing the system as otherwise it will explode.
       */
      *bEmulateGPU = (bEmulateGPUEnvVarSet ||
-                    (!bDoNonbonded &&
-                     gpu_opt->ncuda_dev_use > 0));
+                    (!bDoNonbonded && gpu_opt->n_dev_use > 0));
  
      /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
       */
-    if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
+    if (gpu_opt->n_dev_use > 0 && !(*bEmulateGPU))
      {
          /* Each PP node will use the intra-node id-th device from the
           * list of detected/selected GPUs. */
@@ -1794,10 +1794,13 @@ static void pick_nbnxn_resources(FILE                *fp,
          {
              /* At this point the init should never fail as we made sure that
               * we have all the GPUs we need. If it still does, we'll bail. */
+            /* TODO the decorating of gpu_err_str is nicer if it
+               happens inside init_gpu. Out here, the decorating with
+               the MPI rank makes sense. */
              gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
                        cr->nodeid,
-                      get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
-                                        cr->rank_pp_intranode),
+                      get_cuda_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+                                             cr->rank_pp_intranode),
                        gpu_err_str);
          }
  
@@ -1937,15 +1940,18 @@ static void potential_switch_constants(real rsw, real rc,
      sc->c5 =  -6*pow(rc - rsw, -5);
  }
  
+/*! \brief Construct interaction constants
+ *
+ * This data is used (particularly) by search and force code for
+ * short-range interactions. Many of these are constant for the whole
+ * simulation; some are constant only after PME tuning completes.
+ */
  static void
  init_interaction_const(FILE                       *fp,
-                       const t_commrec gmx_unused *cr,
                         interaction_const_t       **interaction_const,
-                       const t_forcerec           *fr,
-                       real                        rtab)
+                       const t_forcerec           *fr)
  {
      interaction_const_t *ic;
-    gmx_bool             bUsesSimpleTables = TRUE;
      const real           minusSix          = -6.0;
      const real           minusTwelve       = -12.0;
  
@@ -2069,10 +2075,19 @@ init_interaction_const(FILE                       *fp,
      }
  
      *interaction_const = ic;
+}
  
-    if (fr->nbv != NULL && fr->nbv->bUseGPU)
+/*! \brief Manage initialization within the NBNXN module of
+ * run-time constants.
+ */
+static void
+initialize_gpu_constants(const t_commrec gmx_unused      *cr,
+                         interaction_const_t             *interaction_const,
+                         const struct nonbonded_verlet_t *nbv)
+{
+    if (nbv != NULL && nbv->bUseGPU)
      {
-        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+        nbnxn_gpu_init_const(nbv->gpu_nbv, interaction_const, nbv->grp);
  
          /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
           * also sharing texture references. To keep the code simple, we don't
@@ -2084,7 +2099,7 @@ init_interaction_const(FILE                       *fp,
           *
           * Note that we could omit this barrier if GPUs are not shared (or
           * texture objects are used), but as this is initialization code, there
-         * is not point in complicating things.
+         * is no point in complicating things.
           */
  #ifdef GMX_THREAD_MPI
          if (PAR(cr))
@@ -2094,8 +2109,6 @@ init_interaction_const(FILE                       *fp,
  #endif  /* GMX_THREAD_MPI */
      }
  
-    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
-    init_interaction_const_tables(fp, ic, bUsesSimpleTables, rtab);
  }
  
  static void init_nb_verlet(FILE                *fp,
@@ -2163,12 +2176,14 @@ static void init_nb_verlet(FILE                *fp,
  
      if (nbv->bUseGPU)
      {
+        nbnxn_gpu_compile_kernels(cr->rank_pp_intranode, cr->nodeid, &fr->hwinfo->gpu_info, fr->gpu_opt, fr->ic);
+
          /* init the NxN GPU data; the last argument tells whether we'll have
           * both local and non-local NB calculation on GPU */
-        nbnxn_cuda_init(fp, &nbv->cu_nbv,
-                        &fr->hwinfo->gpu_info, fr->gpu_opt,
-                        cr->rank_pp_intranode,
-                        (nbv->ngrp > 1) && !bHybridGPURun);
+        nbnxn_gpu_init(fp, &nbv->gpu_nbv,
+                       &fr->hwinfo->gpu_info, fr->gpu_opt,
+                       cr->rank_pp_intranode,
+                       (nbv->ngrp > 1) && !bHybridGPURun);
  
          if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
          {
@@ -2188,7 +2203,7 @@ static void init_nb_verlet(FILE                *fp,
          }
          else
          {
-            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
+            nbv->min_ci_balanced = nbnxn_gpu_min_ci_balanced(nbv->gpu_nbv);
              if (debug)
              {
                  fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
@@ -2211,16 +2226,8 @@ static void init_nb_verlet(FILE                *fp,
  
      for (i = 0; i < nbv->ngrp; i++)
      {
-        if (nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
-        {
-            nb_alloc = &pmalloc;
-            nb_free  = &pfree;
-        }
-        else
-        {
-            nb_alloc = NULL;
-            nb_free  = NULL;
-        }
+        gpu_set_host_malloc_and_free(nbv->grp[0].kernel_type == nbnxnk8x8x8_GPU,
+                                     &nb_alloc, &nb_free);
  
          nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
                                  nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
@@ -3207,6 +3214,9 @@ void init_forcerec(FILE              *fp,
  
      snew(fr->excl_load, fr->nthreads+1);
  
+    /* fr->ic is used both by verlet and group kernels (to some extent) now */
+    init_interaction_const(fp, &fr->ic, fr);
+
      if (fr->cutoff_scheme == ecutsVERLET)
      {
          if (ir->rcoulomb != ir->rvdw)
@@ -3217,8 +3227,10 @@ void init_forcerec(FILE              *fp,
          init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
      }
  
-    /* fr->ic is used both by verlet and group kernels (to some extent) now */
-    init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+    initialize_gpu_constants(cr, fr->ic, fr->nbv);
+    init_interaction_const_tables(fp, fr->ic,
+                                  uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1),
+                                  rtab);
  
      if (ir->eDispCorr != edispcNO)
      {
@@ -3293,7 +3305,7 @@ void forcerec_set_excl_load(t_forcerec           *fr,
      }
  }
  
-/* Frees GPU memory and destroys the CUDA context.
+/* Frees GPU memory and destroys the GPU context.
   *
   * Note that this function needs to be called even if GPUs are not used
   * in this run because the PME ranks have no knowledge of whether GPUs
@@ -3312,7 +3324,7 @@ void free_gpu_resources(const t_forcerec     *fr,
      if (bIsPPrankUsingGPU)
      {
          /* free nbnxn data in GPU memory */
-        nbnxn_cuda_free(fr->nbv->cu_nbv);
+        nbnxn_gpu_free(fr->nbv->gpu_nbv);
  
          /* With tMPI we need to wait for all ranks to finish deallocation before
           * destroying the context in free_gpu() as some ranks may be sharing
@@ -3328,10 +3340,10 @@ void free_gpu_resources(const t_forcerec     *fr,
  #endif  /* GMX_THREAD_MPI */
  
          /* uninitialize GPU (by destroying the context) */
-        if (!free_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
+        if (!free_cuda_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
          {
              gmx_warning("On rank %d failed to free GPU #%d: %s",
-                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+                        cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
          }
      }
  }
diff --git a/src/gromacs/mdlib/nb_verlet.h b/src/gromacs/mdlib/nb_verlet.h

index c435262ce7697a80ee36cd4cc699e40fa1e5d48e..67bb46cccb3725a7d180fe4b881bad8855b5ccae 100644 (file)
--- a/src/gromacs/mdlib/nb_verlet.h
+++ b/src/gromacs/mdlib/nb_verlet.h
@@ -36,7 +36,7 @@
  #ifndef NB_VERLET_H
  #define NB_VERLET_H
  
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
  #include "gromacs/mdlib/nbnxn_pairlist.h"
  
  #ifdef __cplusplus
@@ -44,14 +44,14 @@ extern "C" {
  #endif
  
  
-/** Nonbonded NxN kernel types: plain C, CPU SIMD, GPU CUDA, GPU emulation */
+/** Nonbonded NxN kernel types: plain C, CPU SIMD, GPU, GPU emulation */
  typedef enum
  {
      nbnxnkNotSet = 0,
      nbnxnk4x4_PlainC,
      nbnxnk4xN_SIMD_4xN,
      nbnxnk4xN_SIMD_2xNN,
-    nbnxnk8x8x8_CUDA,
+    nbnxnk8x8x8_GPU,
      nbnxnk8x8x8_PlainC,
      nbnxnkNR
  } nbnxn_kernel_type;
@@ -102,9 +102,9 @@ typedef struct nonbonded_verlet_t {
      nonbonded_verlet_group_t grp[2];          /* local and non-local interaction group */
  
      gmx_bool                 bUseGPU;         /* TRUE when GPU acceleration is used */
-    nbnxn_cuda_ptr_t         cu_nbv;          /* pointer to CUDA nb verlet data     */
+    gmx_nbnxn_gpu_t         *gpu_nbv;         /* pointer to GPU nb verlet data     */
      int                      min_ci_balanced; /* pair list balancing parameter
-                                                 used for the 8x8x8 CUDA kernels    */
+                                                 used for the 8x8x8 GPU kernels    */
  } nonbonded_verlet_t;
  
  /*! \brief Getter for bUseGPU */
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c

index c20dfb2e54bc18f313c3f002d2600154f25b472e..d1bc25967e4a3d515100cb2bf8f9ce267220fc92 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -132,7 +132,7 @@ void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat, int n)
                         nbat->alloc, nbat->free);
      for (t = 0; t < nbat->nout; t++)
      {
-        /* Allocate one element extra for possible signaling with CUDA */
+        /* Allocate one element extra for possible signaling with GPUs */
          nbnxn_realloc_void((void **)&nbat->out[t].f,
                             nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
                             n*nbat->fstride*sizeof(*nbat->out[t].f),
diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h

index 719e47b8f4f6c99d94b70fc2f74cbe96d9e57466..d685a453c06396e214c37299813a32012d2e7ca2 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_consts.h
+++ b/src/gromacs/mdlib/nbnxn_consts.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu

index 959b57f59cd67f9eec31f2e3d86322eec5ff6b59..596345a9756768ad7c83e95e90a2bdff3537e3db 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -32,15 +32,20 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+/*! \file
+ *  \brief Define CUDA implementation of nbnxn_gpu.h
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ */
  #include "gmxpre.h"
  
-#include "nbnxn_cuda.h"
-
  #include "config.h"
  
  #include <assert.h>
  #include <stdlib.h>
  
+#include "gromacs/mdlib/nbnxn_gpu.h"
+
  #if defined(_MSVC)
  #include <limits>
  #endif
@@ -56,9 +61,10 @@
  #include "gromacs/legacyheaders/types/simple.h"
  #include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/mdlib/nbnxn_pairlist.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
  #include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
  #include "gromacs/utility/cstringutil.h"
  
  #include "nbnxn_cuda_types.h"
@@ -137,6 +143,7 @@ texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  #undef CALC_ENERGIES
  #undef PRUNE_NBL
  
+
  /*! Nonbonded kernel function pointer type */
  typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
                                       const cu_nbparam_t,
@@ -158,7 +165,7 @@ static bool always_prune = (getenv("GMX_GPU_ALWAYS_PRUNE") != NULL);
  static unsigned int poll_wait_pattern = (0x7FU << 23);
  
  /*! Returns the number of blocks to be used for the nonbonded GPU kernel. */
-static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
+static inline int calc_nb_kernel_nblock(int nwork_units, gmx_device_info_t *dinfo)
  {
      int max_grid_x_size;
  
@@ -309,10 +316,10 @@ static inline int calc_shmem_required(const int num_threads_z)
     misc_ops_done event to record the point in time when the above  operations
     are finished and synchronize with this event in the non-local stream.
   */
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
-                              const nbnxn_atomdata_t *nbatom,
-                              int                     flags,
-                              int                     iloc)
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
+                             const nbnxn_atomdata_t *nbatom,
+                             int                     flags,
+                             int                     iloc)
  {
      cudaError_t          stat;
      int                  adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
@@ -321,15 +328,15 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
      dim3                 dim_block, dim_grid;
      nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
  
-    cu_atomdata_t       *adat    = cu_nb->atdat;
-    cu_nbparam_t        *nbp     = cu_nb->nbparam;
-    cu_plist_t          *plist   = cu_nb->plist[iloc];
-    cu_timers_t         *t       = cu_nb->timers;
-    cudaStream_t         stream  = cu_nb->stream[iloc];
+    cu_atomdata_t       *adat    = nb->atdat;
+    cu_nbparam_t        *nbp     = nb->nbparam;
+    cu_plist_t          *plist   = nb->plist[iloc];
+    cu_timers_t         *t       = nb->timers;
+    cudaStream_t         stream  = nb->stream[iloc];
  
      bool                 bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool                 bCalcFshift = flags & GMX_FORCE_VIRIAL;
-    bool                 bDoTime     = cu_nb->bDoTime;
+    bool                 bDoTime     = nb->bDoTime;
  
      /* turn energy calculation always on/off (for debugging/testing only) */
      bCalcEner = (bCalcEner || always_ener) && !never_ener;
@@ -354,16 +361,16 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
  
      /* When we get here all misc operations issues in the local stream are done,
         so we record that in the local stream and wait for it in the nonlocal one. */
-    if (cu_nb->bUseTwoStreams)
+    if (nb->bUseTwoStreams)
      {
          if (iloc == eintLocal)
          {
-            stat = cudaEventRecord(cu_nb->misc_ops_done, stream);
+            stat = cudaEventRecord(nb->misc_ops_done, stream);
              CU_RET_ERR(stat, "cudaEventRecord on misc_ops_done failed");
          }
          else
          {
-            stat = cudaStreamWaitEvent(stream, cu_nb->misc_ops_done, 0);
+            stat = cudaStreamWaitEvent(stream, nb->misc_ops_done, 0);
              CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_done failed");
          }
      }
@@ -404,11 +411,11 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
       * - The 1D block-grid contains as many blocks as super-clusters.
       */
      int num_threads_z = 1;
-    if (cu_nb->dev_info->prop.major == 3 && cu_nb->dev_info->prop.minor == 7)
+    if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
      {
          num_threads_z = 2;
      }
-    nblock    = calc_nb_kernel_nblock(plist->nsci, cu_nb->dev_info);
+    nblock    = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
      dim_block = dim3(CL_SIZE, CL_SIZE, num_threads_z);
      dim_grid  = dim3(nblock, 1, 1);
      shmem     = calc_shmem_required(num_threads_z);
@@ -434,10 +441,10 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t        cu_nb,
      }
  }
  
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
-                               const nbnxn_atomdata_t *nbatom,
-                               int                     flags,
-                               int                     aloc)
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_cuda_t       *nb,
+                              const nbnxn_atomdata_t *nbatom,
+                              int                     flags,
+                              int                     aloc)
  {
      cudaError_t stat;
      int         adat_begin, adat_len, adat_end; /* local/nonlocal offset and length used for xq and f */
@@ -460,16 +467,16 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
          gmx_incons(stmp);
      }
  
-    cu_atomdata_t   *adat    = cu_nb->atdat;
-    cu_timers_t     *t       = cu_nb->timers;
-    bool             bDoTime = cu_nb->bDoTime;
-    cudaStream_t     stream  = cu_nb->stream[iloc];
+    cu_atomdata_t   *adat    = nb->atdat;
+    cu_timers_t     *t       = nb->timers;
+    bool             bDoTime = nb->bDoTime;
+    cudaStream_t     stream  = nb->stream[iloc];
  
      bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
      bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* don't launch copy-back if there was no work to do */
-    if (cu_nb->plist[iloc]->nsci == 0)
+    if (nb->plist[iloc]->nsci == 0)
      {
          return;
      }
@@ -479,13 +486,13 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
      {
          adat_begin  = 0;
          adat_len    = adat->natoms_local;
-        adat_end    = cu_nb->atdat->natoms_local;
+        adat_end    = nb->atdat->natoms_local;
      }
      else
      {
          adat_begin  = adat->natoms_local;
          adat_len    = adat->natoms - adat->natoms_local;
-        adat_end    = cu_nb->atdat->natoms;
+        adat_end    = nb->atdat->natoms;
      }
  
      /* beginning of timed D2H section */
@@ -495,7 +502,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
          CU_RET_ERR(stat, "cudaEventRecord failed");
      }
  
-    if (!cu_nb->bUseStreamSync)
+    if (!nb->bUseStreamSync)
      {
          /* For safety reasons set a few (5%) forces to NaN. This way even if the
             polling "hack" fails with some future NVIDIA driver we'll get a crash. */
@@ -525,9 +532,9 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
  
      /* With DD the local D2H transfer can only start after the non-local
         has been launched. */
-    if (iloc == eintLocal && cu_nb->bUseTwoStreams)
+    if (iloc == eintLocal && nb->bUseTwoStreams)
      {
-        stat = cudaStreamWaitEvent(stream, cu_nb->nonlocal_done, 0);
+        stat = cudaStreamWaitEvent(stream, nb->nonlocal_done, 0);
          CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
      }
  
@@ -541,7 +548,7 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
         data back first. */
      if (iloc == eintNonlocal)
      {
-        stat = cudaEventRecord(cu_nb->nonlocal_done, stream);
+        stat = cudaEventRecord(nb->nonlocal_done, stream);
          CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
      }
  
@@ -551,17 +558,17 @@ void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t        cu_nb,
          /* DtoH fshift */
          if (bCalcFshift)
          {
-            cu_copy_D2H_async(cu_nb->nbst.fshift, adat->fshift,
-                              SHIFTS * sizeof(*cu_nb->nbst.fshift), stream);
+            cu_copy_D2H_async(nb->nbst.fshift, adat->fshift,
+                              SHIFTS * sizeof(*nb->nbst.fshift), stream);
          }
  
          /* DtoH energies */
          if (bCalcEner)
          {
-            cu_copy_D2H_async(cu_nb->nbst.e_lj, adat->e_lj,
-                              sizeof(*cu_nb->nbst.e_lj), stream);
-            cu_copy_D2H_async(cu_nb->nbst.e_el, adat->e_el,
-                              sizeof(*cu_nb->nbst.e_el), stream);
+            cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj,
+                              sizeof(*nb->nbst.e_lj), stream);
+            cu_copy_D2H_async(nb->nbst.e_el, adat->e_el,
+                              sizeof(*nb->nbst.e_el), stream);
          }
      }
  
@@ -589,10 +596,10 @@ static inline bool atomic_cas(volatile unsigned int *ptr,
  #endif
  }
  
-void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
-                         const nbnxn_atomdata_t *nbatom,
-                         int flags, int aloc,
-                         real *e_lj, real *e_el, rvec *fshift)
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
+                            const nbnxn_atomdata_t *nbatom,
+                            int flags, int aloc,
+                            real *e_lj, real *e_el, rvec *fshift)
  {
      /* NOTE:  only implemented for single-precision at this time */
      cudaError_t            stat;
@@ -616,13 +623,13 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
          gmx_incons(stmp);
      }
  
-    cu_plist_t      *plist    = cu_nb->plist[iloc];
-    cu_timers_t     *timers   = cu_nb->timers;
-    wallclock_gpu_t *timings  = cu_nb->timings;
-    nb_staging       nbst     = cu_nb->nbst;
+    cu_plist_t                 *plist    = nb->plist[iloc];
+    cu_timers_t                *timers   = nb->timers;
+    struct gmx_wallclock_gpu_t *timings  = nb->timings;
+    nb_staging                  nbst     = nb->nbst;
  
-    bool             bCalcEner   = flags & GMX_FORCE_ENERGY;
-    bool             bCalcFshift = flags & GMX_FORCE_VIRIAL;
+    bool                        bCalcEner   = flags & GMX_FORCE_ENERGY;
+    bool                        bCalcFshift = flags & GMX_FORCE_VIRIAL;
  
      /* turn energy calculation always on/off (for debugging/testing only) */
      bCalcEner = (bCalcEner || always_ener) && !never_ener;
@@ -632,7 +639,7 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
         NOTE: if timing with multiple GPUs (streams) becomes possible, the
         counters could end up being inconsistent due to not being incremented
         on some of the nodes! */
-    if (cu_nb->plist[iloc]->nsci == 0)
+    if (nb->plist[iloc]->nsci == 0)
      {
          return;
      }
@@ -640,16 +647,16 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
      /* calculate the atom data index range based on locality */
      if (LOCAL_A(aloc))
      {
-        adat_end = cu_nb->atdat->natoms_local;
+        adat_end = nb->atdat->natoms_local;
      }
      else
      {
-        adat_end = cu_nb->atdat->natoms;
+        adat_end = nb->atdat->natoms;
      }
  
-    if (cu_nb->bUseStreamSync)
+    if (nb->bUseStreamSync)
      {
-        stat = cudaStreamSynchronize(cu_nb->stream[iloc]);
+        stat = cudaStreamSynchronize(nb->stream[iloc]);
          CU_RET_ERR(stat, "cudaStreamSynchronize failed in cu_blockwait_nb");
      }
      else
@@ -667,7 +674,7 @@ void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t cu_nb,
      }
  
      /* timing data accumulation */
-    if (cu_nb->bDoTime)
+    if (nb->bDoTime)
      {
          /* only increase counter once (at local F wait) */
          if (LOCAL_I(iloc))
@@ -746,7 +753,7 @@ const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_
  
  /*! Set up the cache configuration for the non-bonded kernels,
   */
-void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
+void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo)
  {
      cudaError_t stat;
  
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu

index efefc77f416233a3474fc85295c9c365e5913f2e..1cd9b4aa1368fa98d69f19825594c39b771df7cb 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -32,10 +32,13 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
+/*! \file
+ *  \brief Define CUDA implementation of nbnxn_gpu_data_mgmt.h
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ */
  #include "gmxpre.h"
  
-#include "nbnxn_cuda_data_mgmt.h"
-
  #include "config.h"
  
  #include <assert.h>
@@ -56,7 +59,9 @@
  #include "gromacs/legacyheaders/types/interaction_const.h"
  #include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/gpu_timing.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/fatalerror.h"
@@ -73,7 +78,7 @@ static bool bUseCudaEventBlockingSync = false; /* makes the CPU thread block */
  static unsigned int gpu_min_ci_balanced_factor = 40;
  
  /* Functions from nbnxn_cuda.cu */
-extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
+extern void nbnxn_cuda_set_cacheconfig(gmx_device_info_t *devinfo);
  extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
  extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
  extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
@@ -107,15 +112,15 @@ static void md_print_warn(FILE       *fplog,
  
  
  /* Fw. decl. */
-static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb);
+static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb);
  
  
  /*! Tabulates the Ewald Coulomb force and initializes the size/scale
      and the table GPU array. If called with an already allocated table,
      it just re-uploads the table.
   */
-static void init_ewald_coulomb_force_table(cu_nbparam_t          *nbp,
-                                           const cuda_dev_info_t *dev_info)
+static void init_ewald_coulomb_force_table(cu_nbparam_t            *nbp,
+                                           const gmx_device_info_t *dev_info)
  {
      float       *ftmp, *coul_tab;
      int          tabsize;
@@ -211,8 +216,8 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
  
  /*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
      earlier GPUs, single or twin cut-off. */
-static int pick_ewald_kernel_type(bool                   bTwinCut,
-                                  const cuda_dev_info_t *dev_info)
+static int pick_ewald_kernel_type(bool                     bTwinCut,
+                                  const gmx_device_info_t *dev_info)
  {
      bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
      int  kernel_type;
@@ -288,7 +293,7 @@ static void set_cutoff_parameters(cu_nbparam_t              *nbp,
  static void init_nbparam(cu_nbparam_t              *nbp,
                           const interaction_const_t *ic,
                           const nbnxn_atomdata_t    *nbat,
-                         const cuda_dev_info_t     *dev_info)
+                         const gmx_device_info_t   *dev_info)
  {
      cudaError_t stat;
      int         ntypes, nnbfp, nnbfp_comb;
@@ -428,29 +433,29 @@ static void init_nbparam(cu_nbparam_t              *nbp,
  
  /*! Re-generate the GPU Ewald force table, resets rlist, and update the
   *  electrostatic type switching to twin cut-off (or back) if needed. */
-void nbnxn_cuda_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
-                                         const interaction_const_t   *ic)
+void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
+                                        const interaction_const_t   *ic)
  {
-    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_CUDA)
+    if (!nbv || nbv->grp[0].kernel_type != nbnxnk8x8x8_GPU)
      {
          return;
      }
-    nbnxn_cuda_ptr_t cu_nb = nbv->cu_nbv;
-    cu_nbparam_t    *nbp   = cu_nb->nbparam;
+    gmx_nbnxn_cuda_t *nb    = nbv->gpu_nbv;
+    cu_nbparam_t     *nbp   = nb->nbparam;
  
      set_cutoff_parameters(nbp, ic);
  
      nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
-                                                 cu_nb->dev_info);
+                                                 nb->dev_info);
  
-    init_ewald_coulomb_force_table(cu_nb->nbparam, cu_nb->dev_info);
+    init_ewald_coulomb_force_table(nb->nbparam, nb->dev_info);
  }
  
  /*! Initializes the pair list data structure. */
  static void init_plist(cu_plist_t *pl)
  {
      /* initialize to NULL pointers to data that is not allocated here and will
-       need reallocation in nbnxn_cuda_init_pairlist */
+       need reallocation in nbnxn_gpu_init_pairlist */
      pl->sci     = NULL;
      pl->cj4     = NULL;
      pl->excl    = NULL;
@@ -504,7 +509,7 @@ static void init_timers(cu_timers_t *t, bool bUseTwoStreams)
  }
  
  /*! Initializes the timings data structure. */
-static void init_timings(wallclock_gpu_t *t)
+static void init_timings(gmx_wallclock_gpu_t *t)
  {
      int i, j;
  
@@ -523,22 +528,22 @@ static void init_timings(wallclock_gpu_t *t)
      }
  }
  
-void nbnxn_cuda_init(FILE                 *fplog,
-                     nbnxn_cuda_ptr_t     *p_cu_nb,
-                     const gmx_gpu_info_t *gpu_info,
-                     const gmx_gpu_opt_t  *gpu_opt,
-                     int                   my_gpu_index,
-                     gmx_bool              bLocalAndNonlocal)
+void nbnxn_gpu_init(FILE                 *fplog,
+                    gmx_nbnxn_cuda_t    **p_nb,
+                    const gmx_gpu_info_t *gpu_info,
+                    const gmx_gpu_opt_t  *gpu_opt,
+                    int                   my_gpu_index,
+                    gmx_bool              bLocalAndNonlocal)
  {
      cudaError_t       stat;
-    nbnxn_cuda_ptr_t  nb;
+    gmx_nbnxn_cuda_t *nb;
      char              sbuf[STRLEN];
      bool              bStreamSync, bNoStreamSync, bTMPIAtomics, bX86, bOldDriver;
      int               cuda_drv_ver;
  
      assert(gpu_info);
  
-    if (p_cu_nb == NULL)
+    if (p_nb == NULL)
      {
          return;
      }
@@ -565,7 +570,7 @@ void nbnxn_cuda_init(FILE                 *fplog,
      init_plist(nb->plist[eintLocal]);
  
      /* set device info, just point it to the right GPU among the detected ones */
-    nb->dev_info = &gpu_info->cuda_dev[get_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
+    nb->dev_info = &gpu_info->gpu_dev[get_cuda_gpu_device_id(gpu_info, gpu_opt, my_gpu_index)];
  
      /* local/non-local GPU streams */
      stat = cudaStreamCreate(&nb->stream[eintLocal]);
@@ -726,7 +731,7 @@ void nbnxn_cuda_init(FILE                 *fplog,
      /* pick L1 cache configuration */
      nbnxn_cuda_set_cacheconfig(nb->dev_info);
  
-    *p_cu_nb = nb;
+    *p_nb = nb;
  
      if (debug)
      {
@@ -734,26 +739,26 @@ void nbnxn_cuda_init(FILE                 *fplog,
      }
  }
  
-void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t                cu_nb,
-                           const interaction_const_t      *ic,
-                           const nonbonded_verlet_group_t *nbv_group)
+void nbnxn_gpu_init_const(gmx_nbnxn_cuda_t               *nb,
+                          const interaction_const_t      *ic,
+                          const nonbonded_verlet_group_t *nbv_group)
  {
-    init_atomdata_first(cu_nb->atdat, nbv_group[0].nbat->ntype);
-    init_nbparam(cu_nb->nbparam, ic, nbv_group[0].nbat, cu_nb->dev_info);
+    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype);
+    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
  
      /* clear energy and shift force outputs */
-    nbnxn_cuda_clear_e_fshift(cu_nb);
+    nbnxn_cuda_clear_e_fshift(nb);
  }
  
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
-                              const nbnxn_pairlist_t *h_plist,
-                              int                     iloc)
+void nbnxn_gpu_init_pairlist(gmx_nbnxn_cuda_t       *nb,
+                             const nbnxn_pairlist_t *h_plist,
+                             int                     iloc)
  {
      char          sbuf[STRLEN];
      cudaError_t   stat;
-    bool          bDoTime    = cu_nb->bDoTime;
-    cudaStream_t  stream     = cu_nb->stream[iloc];
-    cu_plist_t   *d_plist    = cu_nb->plist[iloc];
+    bool          bDoTime    = nb->bDoTime;
+    cudaStream_t  stream     = nb->stream[iloc];
+    cu_plist_t   *d_plist    = nb->plist[iloc];
  
      if (d_plist->na_c < 0)
      {
@@ -771,7 +776,7 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
  
      if (bDoTime)
      {
-        stat = cudaEventRecord(cu_nb->timers->start_pl_h2d[iloc], stream);
+        stat = cudaEventRecord(nb->timers->start_pl_h2d[iloc], stream);
          CU_RET_ERR(stat, "cudaEventRecord failed");
      }
  
@@ -792,7 +797,7 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
  
      if (bDoTime)
      {
-        stat = cudaEventRecord(cu_nb->timers->stop_pl_h2d[iloc], stream);
+        stat = cudaEventRecord(nb->timers->stop_pl_h2d[iloc], stream);
          CU_RET_ERR(stat, "cudaEventRecord failed");
      }
  
@@ -800,11 +805,11 @@ void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t        cu_nb,
      d_plist->bDoPrune = true;
  }
  
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
-                                const nbnxn_atomdata_t *nbatom)
+void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_cuda_t       *nb,
+                               const nbnxn_atomdata_t *nbatom)
  {
-    cu_atomdata_t *adat  = cu_nb->atdat;
-    cudaStream_t   ls    = cu_nb->stream[eintLocal];
+    cu_atomdata_t *adat  = nb->atdat;
+    cudaStream_t   ls    = nb->stream[eintLocal];
  
      /* only if we have a dynamic box */
      if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -816,22 +821,22 @@ void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t        cu_nb,
  }
  
  /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */
-static void nbnxn_cuda_clear_f(nbnxn_cuda_ptr_t cu_nb, int natoms_clear)
+static void nbnxn_cuda_clear_f(gmx_nbnxn_cuda_t *nb, int natoms_clear)
  {
      cudaError_t    stat;
-    cu_atomdata_t *adat  = cu_nb->atdat;
-    cudaStream_t   ls    = cu_nb->stream[eintLocal];
+    cu_atomdata_t *adat  = nb->atdat;
+    cudaStream_t   ls    = nb->stream[eintLocal];
  
      stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
  }
  
  /*! Clears nonbonded shift force output array and energy outputs on the GPU. */
-static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
+static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb)
  {
      cudaError_t    stat;
-    cu_atomdata_t *adat  = cu_nb->atdat;
-    cudaStream_t   ls    = cu_nb->stream[eintLocal];
+    cu_atomdata_t *adat  = nb->atdat;
+    cudaStream_t   ls    = nb->stream[eintLocal];
  
      stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -841,27 +846,27 @@ static void nbnxn_cuda_clear_e_fshift(nbnxn_cuda_ptr_t cu_nb)
      CU_RET_ERR(stat, "cudaMemsetAsync on e_el falied");
  }
  
-void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t cu_nb, int flags)
+void nbnxn_gpu_clear_outputs(gmx_nbnxn_cuda_t *nb, int flags)
  {
-    nbnxn_cuda_clear_f(cu_nb, cu_nb->atdat->natoms);
+    nbnxn_cuda_clear_f(nb, nb->atdat->natoms);
      /* clear shift force array and energies if the outputs were
         used in the current step */
      if (flags & GMX_FORCE_VIRIAL)
      {
-        nbnxn_cuda_clear_e_fshift(cu_nb);
+        nbnxn_cuda_clear_e_fshift(nb);
      }
  }
  
-void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
-                              const nbnxn_atomdata_t *nbat)
+void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t              *nb,
+                             const struct nbnxn_atomdata_t *nbat)
  {
      cudaError_t    stat;
      int            nalloc, natoms;
      bool           realloced;
-    bool           bDoTime   = cu_nb->bDoTime;
-    cu_timers_t   *timers    = cu_nb->timers;
-    cu_atomdata_t *d_atdat   = cu_nb->atdat;
-    cudaStream_t   ls        = cu_nb->stream[eintLocal];
+    bool           bDoTime   = nb->bDoTime;
+    cu_timers_t   *timers    = nb->timers;
+    cu_atomdata_t *d_atdat   = nb->atdat;
+    cudaStream_t   ls        = nb->stream[eintLocal];
  
      natoms    = nbat->natoms;
      realloced = false;
@@ -905,7 +910,7 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
      /* need to clear GPU f output if realloc happened */
      if (realloced)
      {
-        nbnxn_cuda_clear_f(cu_nb, nalloc);
+        nbnxn_cuda_clear_f(nb, nalloc);
      }
  
      cu_copy_H2D_async(d_atdat->atom_types, nbat->type,
@@ -918,7 +923,7 @@ void nbnxn_cuda_init_atomdata(nbnxn_cuda_ptr_t        cu_nb,
      }
  }
  
-void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
+void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
  {
      cudaError_t      stat;
      cu_atomdata_t   *atdat;
@@ -934,23 +939,23 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
          CU_RET_ERR(stat, "cudaProfilerStop failed");
      }
  
-    if (cu_nb == NULL)
+    if (nb == NULL)
      {
          return;
      }
  
-    atdat       = cu_nb->atdat;
-    nbparam     = cu_nb->nbparam;
-    plist       = cu_nb->plist[eintLocal];
-    plist_nl    = cu_nb->plist[eintNonlocal];
-    timers      = cu_nb->timers;
+    atdat       = nb->atdat;
+    nbparam     = nb->nbparam;
+    plist       = nb->plist[eintLocal];
+    plist_nl    = nb->plist[eintNonlocal];
+    timers      = nb->timers;
  
      if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
      {
  
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
          /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (cu_nb->dev_info->prop.major >= 3)
+        if (nb->dev_info->prop.major >= 3)
          {
              stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
              CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
@@ -964,12 +969,12 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
          cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
      }
  
-    stat = cudaEventDestroy(cu_nb->nonlocal_done);
+    stat = cudaEventDestroy(nb->nonlocal_done);
      CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
-    stat = cudaEventDestroy(cu_nb->misc_ops_done);
+    stat = cudaEventDestroy(nb->misc_ops_done);
      CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_done");
  
-    if (cu_nb->bDoTime)
+    if (nb->bDoTime)
      {
          stat = cudaEventDestroy(timers->start_atdat);
          CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_atdat");
@@ -977,7 +982,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
          CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_atdat");
  
          /* The non-local counters/stream (second in the array) are needed only with DD. */
-        for (int i = 0; i <= (cu_nb->bUseTwoStreams ? 1 : 0); i++)
+        for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
          {
              stat = cudaEventDestroy(timers->start_nb_k[i]);
              CU_RET_ERR(stat, "cudaEventDestroy failed on timers->start_nb_k");
@@ -989,7 +994,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
              stat = cudaEventDestroy(timers->stop_pl_h2d[i]);
              CU_RET_ERR(stat, "cudaEventDestroy failed on timers->stop_pl_h2d");
  
-            stat = cudaStreamDestroy(cu_nb->stream[i]);
+            stat = cudaStreamDestroy(nb->stream[i]);
              CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
  
              stat = cudaEventDestroy(timers->start_nb_h2d[i]);
@@ -1006,7 +1011,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
  
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
      /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-    if (cu_nb->dev_info->prop.major >= 3)
+    if (nb->dev_info->prop.major >= 3)
      {
          stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
          CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
@@ -1023,7 +1028,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
      {
  #ifdef HAVE_CUDA_TEXOBJ_SUPPORT
          /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (cu_nb->dev_info->prop.major >= 3)
+        if (nb->dev_info->prop.major >= 3)
          {
              stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
              CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
@@ -1054,7 +1059,7 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
      cu_free_buffered(plist->sci, &plist->nsci, &plist->sci_nalloc);
      cu_free_buffered(plist->cj4, &plist->ncj4, &plist->cj4_nalloc);
      cu_free_buffered(plist->excl, &plist->nexcl, &plist->excl_nalloc);
-    if (cu_nb->bUseTwoStreams)
+    if (nb->bUseTwoStreams)
      {
          cu_free_buffered(plist_nl->sci, &plist_nl->nsci, &plist_nl->sci_nalloc);
          cu_free_buffered(plist_nl->cj4, &plist_nl->ncj4, &plist_nl->cj4_nalloc);
@@ -1064,13 +1069,13 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
      sfree(atdat);
      sfree(nbparam);
      sfree(plist);
-    if (cu_nb->bUseTwoStreams)
+    if (nb->bUseTwoStreams)
      {
          sfree(plist_nl);
      }
      sfree(timers);
-    sfree(cu_nb->timings);
-    sfree(cu_nb);
+    sfree(nb->timings);
+    sfree(nb);
  
      if (debug)
      {
@@ -1078,21 +1083,21 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb)
      }
  }
  
-void cu_synchstream_atdat(nbnxn_cuda_ptr_t cu_nb, int iloc)
+void cu_synchstream_atdat(gmx_nbnxn_cuda_t *nb, int iloc)
  {
      cudaError_t  stat;
-    cudaStream_t stream = cu_nb->stream[iloc];
+    cudaStream_t stream = nb->stream[iloc];
  
-    stat = cudaStreamWaitEvent(stream, cu_nb->timers->stop_atdat, 0);
+    stat = cudaStreamWaitEvent(stream, nb->timers->stop_atdat, 0);
      CU_RET_ERR(stat, "cudaStreamWaitEvent failed");
  }
  
-wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t cu_nb)
+gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_cuda_t *nb)
  {
-    return (cu_nb != NULL && cu_nb->bDoTime) ? cu_nb->timings : NULL;
+    return (nb != NULL && nb->bDoTime) ? nb->timings : NULL;
  }
  
-void nbnxn_cuda_reset_timings(nonbonded_verlet_t* nbv)
+void nbnxn_gpu_reset_timings(nonbonded_verlet_t* nbv)
  {
      /* The NVPROF_ID environment variable is set by nvprof and indicates that
         mdrun is executed in the CUDA profiler.
@@ -1107,21 +1112,21 @@ void nbnxn_cuda_reset_timings(nonbonded_verlet_t* nbv)
          CU_RET_ERR(stat, "cudaProfilerStart failed");
      }
  
-    if (nbv->cu_nbv && nbv->cu_nbv->bDoTime)
+    if (nbv->gpu_nbv && nbv->gpu_nbv->bDoTime)
      {
-        init_timings(nbv->cu_nbv->timings);
+        init_timings(nbv->gpu_nbv->timings);
      }
  }
  
-int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t cu_nb)
+int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_cuda_t *nb)
  {
-    return cu_nb != NULL ?
-           gpu_min_ci_balanced_factor*cu_nb->dev_info->prop.multiProcessorCount : 0;
+    return nb != NULL ?
+           gpu_min_ci_balanced_factor*nb->dev_info->prop.multiProcessorCount : 0;
  
  }
  
-gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t cu_nb)
+gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_cuda_t *nb)
  {
-    return ((cu_nb->nbparam->eeltype == eelCuEWALD_ANA) ||
-            (cu_nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
+    return ((nb->nbparam->eeltype == eelCuEWALD_ANA) ||
+            (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
  }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h

deleted file mode 100644 (file)

index 00cac2e..0000000
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-
-#ifndef NBNXN_CUDA_DATA_MGMT_H
-#define NBNXN_CUDA_DATA_MGMT_H
-
-#include "config.h"
-
-#include "gromacs/legacyheaders/types/hw_info.h"
-#include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
-#include "gromacs/legacyheaders/types/simple.h"
-
-#ifdef GMX_GPU
-#define FUNC_TERM ;
-#define FUNC_QUALIFIER
-#else
-#define FUNC_TERM {}
-#define FUNC_QUALIFIER static
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct nonbonded_verlet_group_t;
-struct nbnxn_pairlist_t;
-struct nbnxn_atomdata_t;
-
-/** Initializes the data structures related to CUDA nonbonded calculations. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init(FILE gmx_unused                 *fplog,
-                     nbnxn_cuda_ptr_t gmx_unused     *p_cu_nb,
-                     const gmx_gpu_info_t gmx_unused *gpu_info,
-                     const gmx_gpu_opt_t gmx_unused  *gpu_opt,
-                     int gmx_unused                   my_gpu_index,
-                     /* true of both local and non-local are don on GPU */
-                     gmx_bool gmx_unused              bLocalAndNonlocal) FUNC_TERM
-
-/** Initializes simulation constant data. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t               gmx_unused         cu_nb,
-                           const interaction_const_t      gmx_unused        *ic,
-                           const struct nonbonded_verlet_group_t gmx_unused *nbv_group) FUNC_TERM
-
-/** Initializes pair-list data for GPU, called at every pair search step. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init_pairlist(nbnxn_cuda_ptr_t       gmx_unused         cu_nb,
-                              const struct nbnxn_pairlist_t gmx_unused *h_nblist,
-                              int                    gmx_unused         iloc) FUNC_TERM
-
-/** Initializes atom-data on the GPU, called at every pair search step. */
-FUNC_QUALIFIER
-void nbnxn_cuda_init_atomdata(const nbnxn_cuda_ptr_t       gmx_unused   cu_nb,
-                              const struct nbnxn_atomdata_t gmx_unused *atomdata) FUNC_TERM
-
-/*! \brief Update parameters during PP-PME load balancing. */
-FUNC_QUALIFIER
-void nbnxn_cuda_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
-                                         const interaction_const_t gmx_unused       *ic) FUNC_TERM
-
-/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
-FUNC_QUALIFIER
-void nbnxn_cuda_upload_shiftvec(nbnxn_cuda_ptr_t       gmx_unused         cu_nb,
-                                const struct nbnxn_atomdata_t gmx_unused *nbatom) FUNC_TERM
-
-/** Clears GPU outputs: nonbonded force, shift force and energy. */
-FUNC_QUALIFIER
-void nbnxn_cuda_clear_outputs(nbnxn_cuda_ptr_t gmx_unused cu_nb,
-                              int              gmx_unused flags) FUNC_TERM
-
-/** Frees all GPU resources used for the nonbonded calculations. */
-FUNC_QUALIFIER
-void nbnxn_cuda_free(nbnxn_cuda_ptr_t gmx_unused  cu_nb) FUNC_TERM
-
-/** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
-FUNC_QUALIFIER
-wallclock_gpu_t * nbnxn_cuda_get_timings(nbnxn_cuda_ptr_t gmx_unused cu_nb)
-#ifdef GMX_GPU
-;
-#else
-{
-    return NULL;
-}
-#endif
-
-/** Resets nonbonded GPU timings. */
-FUNC_QUALIFIER
-void nbnxn_cuda_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) FUNC_TERM
-
-/** Calculates the minimum size of proximity lists to improve SM load balance
- *  with CUDA non-bonded kernels. */
-FUNC_QUALIFIER
-int nbnxn_cuda_min_ci_balanced(nbnxn_cuda_ptr_t gmx_unused cu_nb)
-#ifdef GMX_GPU
-;
-#else
-{
-    return -1;
-}
-#endif
-
-/** Returns if analytical Ewald CUDA kernels are used. */
-FUNC_QUALIFIER
-gmx_bool nbnxn_cuda_is_kernel_ewald_analytical(const nbnxn_cuda_ptr_t gmx_unused cu_nb)
-#ifdef GMX_GPU
-;
-#else
-{
-    return FALSE;
-}
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#undef FUNC_TERM
-#undef FUNC_QUALIFIER
-
-#endif /* NBNXN_CUDA_DATA_MGMT_H */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu

new file mode 100644 (file)

index 0000000..91c523e
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu
@@ -0,0 +1,56 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \file
+ *  \brief Define CUDA implementation of nbnxn_gpu_git_support.h
+ *
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/mdlib/nbnxn_gpu_jit_support.h"
+
+void
+nbnxn_gpu_compile_kernels(int                        /*mygpu*/,
+                          int                        /*rank*/,
+                          const gmx_gpu_info_t      */*gpu_info*/,
+                          const gmx_gpu_opt_t       */*gpu_opt*/,
+                          const interaction_const_t */*ic*/)
+{
+    /* CUDA support does not use JIT (yet).
+     *
+     * It would be nice if this function inlined away to nothing, but
+     * it's only used during setup. */
+}
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h

index f39b2327af2055d3257f3548fa25f87fa45a31e1..4ffcc35190d6902d211aacde19580d891c51cfe9 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -50,7 +50,6 @@
  
  #include "gromacs/gmxlib/cuda_tools/cudautils.cuh"
  #include "gromacs/legacyheaders/types/interaction_const.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
  #include "gromacs/mdlib/nbnxn_pairlist.h"
  
  #ifndef HAVE_CUDA_TEXOBJ_SUPPORT
@@ -227,18 +226,18 @@ struct cu_timers
  /** \internal
   * \brief Main data structure for CUDA nonbonded force calculations.
   */
-struct nbnxn_cuda
+struct gmx_nbnxn_cuda_t
  {
-    cuda_dev_info_t *dev_info;       /**< CUDA device information                              */
-    bool             bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
-    bool             bUseStreamSync; /**< true if the standard cudaStreamSynchronize is used
-                                          and not memory polling-based waiting                 */
-    cu_atomdata_t   *atdat;          /**< atom data                                            */
-    cu_nbparam_t    *nbparam;        /**< parameters required for the non-bonded calc.         */
-    cu_plist_t      *plist[2];       /**< pair-list data structures (local and non-local)      */
-    nb_staging_t     nbst;           /**< staging area where fshift/energies get downloaded    */
+    struct gmx_device_info_t *dev_info;       /**< CUDA device information                              */
+    bool                      bUseTwoStreams; /**< true if doing both local/non-local NB work on GPU    */
+    bool                      bUseStreamSync; /**< true if the standard cudaStreamSynchronize is used
+                                                   and not memory polling-based waiting                 */
+    cu_atomdata_t            *atdat;          /**< atom data                                            */
+    cu_nbparam_t             *nbparam;        /**< parameters required for the non-bonded calc.         */
+    cu_plist_t               *plist[2];       /**< pair-list data structures (local and non-local)      */
+    nb_staging_t              nbst;           /**< staging area where fshift/energies get downloaded    */
  
-    cudaStream_t     stream[2];      /**< local and non-local GPU streams                      */
+    cudaStream_t              stream[2];      /**< local and non-local GPU streams                      */
  
      /** events used for synchronization */
      cudaEvent_t    nonlocal_done;    /**< event triggered when the non-local non-bonded kernel
@@ -250,9 +249,9 @@ struct nbnxn_cuda
       * concurrent streams, so we won't time if both l/nl work is done on GPUs.
       * Timer init/uninit is still done even with timing off so only the condition
       * setting bDoTime needs to be change if this CUDA "feature" gets fixed. */
-    bool             bDoTime;       /**< True if event-based timing is enabled.               */
-    cu_timers_t     *timers;        /**< CUDA event-based timers.                             */
-    wallclock_gpu_t *timings;       /**< Timing data.                                         */
+    bool                 bDoTime;   /**< True if event-based timing is enabled.               */
+    cu_timers_t         *timers;    /**< CUDA event-based timers.                             */
+    gmx_wallclock_gpu_t *timings;   /**< Timing data.                                         */
  };
  
  #ifdef __cplusplus
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h b/src/gromacs/mdlib/nbnxn_gpu.h

similarity index 56%

rename from src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h

rename to src/gromacs/mdlib/nbnxn_gpu.h

index 174a9054af9f902254ba00f56653e72f0cda62ad..e9899a81a3bc7fb8c100747ea54866d3cf5b5cf7 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h
+++ b/src/gromacs/mdlib/nbnxn_gpu.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -32,19 +32,20 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
-#ifndef NBNXN_CUDA_H
-#define NBNXN_CUDA_H
+/*! \internal \file
+ *  \brief Declare interface for GPU execution for NBNXN module
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \ingroup module_mdlib
+ */
  
-#include "config.h"
+#ifndef GMX_MDLIB_NBNXN_GPU_H
+#define GMX_MDLIB_NBNXN_GPU_H
  
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
  #include "gromacs/legacyheaders/types/simple.h"
-
-#ifdef GMX_GPU
-#define FUNC_TERM ;
-#else
-#define FUNC_TERM {}
-#endif
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
  
  #ifdef __cplusplus
  extern "C" {
@@ -62,36 +63,41 @@ struct nbnxn_atomdata_t;
   *  The local and non-local interaction calculations are launched in two
   *  separate streams.
   */
-void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t              gmx_unused  cu_nb,
-                              const struct nbnxn_atomdata_t gmx_unused *nbdata,
-                              int                           gmx_unused  flags,
-                              int                           gmx_unused  iloc) FUNC_TERM
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                             const struct nbnxn_atomdata_t gmx_unused *nbdata,
+                             int gmx_unused                            flags,
+                             int gmx_unused                            iloc) GPU_FUNC_TERM
  
  /*! \brief
   * Launch asynchronously the download of nonbonded forces from the GPU
   * (and energies/shift forces if required).
   */
-void nbnxn_cuda_launch_cpyback(nbnxn_cuda_ptr_t              gmx_unused  cu_nb,
-                               const struct nbnxn_atomdata_t gmx_unused *nbatom,
-                               int                           gmx_unused  flags,
-                               int                           gmx_unused  aloc) FUNC_TERM
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused              *nb,
+                              const struct nbnxn_atomdata_t gmx_unused *nbatom,
+                              int                    gmx_unused         flags,
+                              int                    gmx_unused         aloc) GPU_FUNC_TERM
  
  /*! \brief
   * Wait for the asynchronously launched nonbonded calculations and data
   * transfers to finish.
   */
-void nbnxn_cuda_wait_gpu(nbnxn_cuda_ptr_t              gmx_unused  cu_nb,
-                         const struct nbnxn_atomdata_t gmx_unused *nbatom,
-                         int                           gmx_unused  flags,
-                         int                           gmx_unused  aloc,
-                         real                          gmx_unused *e_lj,
-                         real                          gmx_unused *e_el,
-                         rvec                          gmx_unused *fshift) FUNC_TERM
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                            const struct nbnxn_atomdata_t gmx_unused *nbatom,
+                            int                    gmx_unused         flags,
+                            int                    gmx_unused         aloc,
+                            real                   gmx_unused        *e_lj,
+                            real                   gmx_unused        *e_el,
+                            rvec                   gmx_unused        *fshift) GPU_FUNC_TERM
+
+/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
+GPU_FUNC_QUALIFIER
+int nbnxn_gpu_pick_ewald_kernel_type(bool gmx_unused bTwinCut) GPU_FUNC_TERM_WITH_RETURN(-1)
  
  #ifdef __cplusplus
  }
  #endif
  
-#undef FUNC_TERM
-
-#endif /* NBNXN_CUDA_H */
+#endif
diff --git a/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h

new file mode 100644 (file)

index 0000000..3a917e8
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h
@@ -0,0 +1,131 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declare interface for GPU data transfer for NBNXN module
+ *
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \ingroup module_mdlib
+ *  \inlibraryapi
+ */
+
+#ifndef NBNXN_GPU_DATA_MGMT_H
+#define NBNXN_GPU_DATA_MGMT_H
+
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/legacyheaders/types/simple.h"
+#include "gromacs/mdlib/nbnxn_gpu_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct nonbonded_verlet_group_t;
+struct nbnxn_pairlist_t;
+struct nbnxn_atomdata_t;
+struct gmx_wallclock_gpu_t;
+struct gmx_gpu_info_t;
+
+/** Initializes the data structures related to GPU nonbonded calculations. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init(FILE gmx_unused                        *fplog,
+                    gmx_nbnxn_gpu_t gmx_unused            **p_nb,
+                    const struct gmx_gpu_info_t gmx_unused *gpu_info,
+                    const gmx_gpu_opt_t gmx_unused         *gpu_opt,
+                    int gmx_unused                          my_gpu_index,
+                    /* true of both local and non-local are don on GPU */
+                    gmx_bool gmx_unused                     bLocalAndNonlocal) GPU_FUNC_TERM
+
+/** Initializes simulation constant data. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init_const(gmx_nbnxn_gpu_t gmx_unused                       *nb,
+                          const interaction_const_t      gmx_unused        *ic,
+                          const struct nonbonded_verlet_group_t gmx_unused *nbv_group) GPU_FUNC_TERM
+
+/** Initializes pair-list data for GPU, called at every pair search step. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init_pairlist(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                             const struct nbnxn_pairlist_t gmx_unused *h_nblist,
+                             int                    gmx_unused         iloc) GPU_FUNC_TERM
+
+/** Initializes atom-data on the GPU, called at every pair search step. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_init_atomdata(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                             const struct nbnxn_atomdata_t gmx_unused *nbat) GPU_FUNC_TERM
+
+/*! \brief Re-generate the GPU Ewald force table, resets rlist, and update the
+ *  electrostatic type switching to twin cut-off (or back) if needed.
+ */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused *nbv,
+                                        const interaction_const_t gmx_unused       *ic) GPU_FUNC_TERM
+
+/** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_upload_shiftvec(gmx_nbnxn_gpu_t gmx_unused               *nb,
+                               const struct nbnxn_atomdata_t gmx_unused *nbatom) GPU_FUNC_TERM
+
+/** Clears GPU outputs: nonbonded force, shift force and energy. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_clear_outputs(gmx_nbnxn_gpu_t gmx_unused *nb,
+                             int              gmx_unused flags) GPU_FUNC_TERM
+
+/** Frees all GPU resources used for the nonbonded calculations. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_free(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM
+
+/** Returns the GPU timings structure or NULL if GPU is not used or timing is off. */
+GPU_FUNC_QUALIFIER
+struct gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(NULL)
+
+/** Resets nonbonded GPU timings. */
+GPU_FUNC_QUALIFIER
+void nbnxn_gpu_reset_timings(struct nonbonded_verlet_t gmx_unused *nbv) GPU_FUNC_TERM
+
+/** Calculates the minimum size of proximity lists to improve SM load balance
+ *  with GPU non-bonded kernels. */
+GPU_FUNC_QUALIFIER
+int nbnxn_gpu_min_ci_balanced(gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(-1)
+
+/** Returns if analytical Ewald GPU kernels are used. */
+GPU_FUNC_QUALIFIER
+gmx_bool nbnxn_gpu_is_kernel_ewald_analytical(const gmx_nbnxn_gpu_t gmx_unused *nb) GPU_FUNC_TERM_WITH_RETURN(FALSE)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gromacs/mdlib/nbnxn_gpu_jit_support.h b/src/gromacs/mdlib/nbnxn_gpu_jit_support.h

new file mode 100644 (file)

index 0000000..f3c0c76
--- /dev/null
+++ b/src/gromacs/mdlib/nbnxn_gpu_jit_support.h
@@ -0,0 +1,60 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Declares functions that support JIT compilation (e.g. for OpenCL)
+ *
+ *  \author Dimitrios Karkoulis <dimitris.karkoulis@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ */
+
+#ifndef GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
+#define GMX_MDLIB_NBNXN_GPU_JIT_SUPPORT_H
+
+#include "gromacs/gmxlib/gpu_utils/gpu_macros.h"
+#include "gromacs/legacyheaders/types/hw_info.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/legacyheaders/types/simple.h"
+
+struct gmx_gpu_info_t;
+
+/*! \brief Handles any JIT compilation of nbnxn kernels for the GPU given by \p mygpu */
+GPU_FUNC_QUALIFIER void
+nbnxn_gpu_compile_kernels(int                       gmx_unused  mygpu,
+                          int                       gmx_unused  rank,
+                          const gmx_gpu_info_t      gmx_unused *gpu_info,
+                          const gmx_gpu_opt_t       gmx_unused *gpu_opt,
+                          const interaction_const_t gmx_unused *ic) GPU_FUNC_TERM
+
+#endif
diff --git a/src/gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h b/src/gromacs/mdlib/nbnxn_gpu_types.h

similarity index 55%

rename from src/gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h

rename to src/gromacs/mdlib/nbnxn_gpu_types.h

index 9efcc19fbb3560becfe01c00565ce5d588343499..f1fe520338ff91bfe64b75b738876144d324c755 100644 (file)
--- a/src/gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h
+++ b/src/gromacs/mdlib/nbnxn_gpu_types.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -33,45 +33,28 @@
   * the research papers on the package. Check out http://www.gromacs.org.
   */
  
-#ifndef NBNXN_CUDA_TYPES_EXT_H
-#define NBNXN_CUDA_TYPES_EXT_H
+#ifndef GMX_MDLIB_NBNXN_GPU_TYPES_H
+#define GMX_MDLIB_NBNXN_GPU_TYPES_H
+
+#include "config.h"
  
  #ifdef __cplusplus
  extern "C" {
  #endif
  
-/* Abstract types */
-/* CUDA nonbonded structure */
-typedef struct nbnxn_cuda *nbnxn_cuda_ptr_t;
-/* CUDA GPU device info */
-typedef struct cuda_dev_info *cuda_dev_info_ptr_t;
+#ifdef GMX_GPU
+
+struct gmx_nbnxn_cuda_t;
+typedef struct gmx_nbnxn_cuda_t gmx_nbnxn_gpu_t;
  
-/* Types defined for the structs below. */
-typedef struct wallclock_gpu wallclock_gpu_t;
-typedef struct nbnxn_cuda_ktime nbnxn_cuda_ktime_t;
+#else
  
-/* Nonbonded kernel time and call count. */
-struct nbnxn_cuda_ktime
-{
-    double  t;
-    int     c;
-};
+typedef int gmx_nbnxn_gpu_t;
  
-/* GPU timings for kernels and H2d/D2H transfers. */
-struct wallclock_gpu
-{
-    nbnxn_cuda_ktime_t ktime[2][2]; /* table containing the timings of the four
-                                       version of the nonbonded kernels: force-only,
-                                       force+energy, force+pruning, and force+energy+pruning */
-    double  nb_h2d_t;               /* host to device transfer time in nb calculation  */
-    double  nb_d2h_t;               /* device to host transfer time in nb calculation */
-    int     nb_c;                   /* total call count of the nonbonded gpu operations */
-    double  pl_h2d_t;               /* pair search step host to device transfer time */
-    int     pl_h2d_c;               /* pair search step  host to device transfer call count */
-};
+#endif
  
  #ifdef __cplusplus
  }
  #endif
  
-#endif /* NBNXN_CUDA_TYPES_EXT_H */
+#endif
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c

index 52d698b11b6d7481d681fb6f115961b2145dec57..1a2a6ac4ea3d3ab10f1363a2d08e55a93295f291 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -241,7 +241,7 @@ static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
          case nbnxnk4xN_SIMD_4xN:
          case nbnxnk4xN_SIMD_2xNN:
              return NBNXN_CPU_CLUSTER_I_SIZE;
-        case nbnxnk8x8x8_CUDA:
+        case nbnxnk8x8x8_GPU:
          case nbnxnk8x8x8_PlainC:
              /* The cluster size for super/sub lists is only set here.
               * Any value should work for the pair-search and atomdata code.
@@ -275,7 +275,7 @@ int nbnxn_kernel_to_cj_size(int nb_kernel_type)
          case nbnxnk4xN_SIMD_2xNN:
              cj_size = nbnxn_simd_width/2;
              break;
-        case nbnxnk8x8x8_CUDA:
+        case nbnxnk8x8x8_GPU:
          case nbnxnk8x8x8_PlainC:
              cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
              break;
@@ -307,7 +307,7 @@ gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
  
      switch (nb_kernel_type)
      {
-        case nbnxnk8x8x8_CUDA:
+        case nbnxnk8x8x8_GPU:
          case nbnxnk8x8x8_PlainC:
              return FALSE;
  
@@ -5322,7 +5322,7 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
                                              break;
  #endif
                                          case nbnxnk8x8x8_PlainC:
-                                        case nbnxnk8x8x8_CUDA:
+                                        case nbnxnk8x8x8_GPU:
                                              check_subcell_list_space_supersub(nbl, cl-cf+1);
                                              for (cj = cf; cj <= cl; cj++)
                                              {
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index b589455c64470774844c55f11b2699933d49683e..7445698b6a638a499011c55834aa561752e91170 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -76,9 +76,8 @@
  #include "gromacs/math/vec.h"
  #include "gromacs/mdlib/nb_verlet.h"
  #include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/mdlib/nbnxn_search.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
  #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
  #include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
  #include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
@@ -88,6 +87,7 @@
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/timing/gpu_timing.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/timing/walltime_accounting.h"
  #include "gromacs/utility/cstringutil.h"
@@ -96,6 +96,7 @@
  #include "gromacs/utility/sysinfo.h"
  
  #include "adress.h"
+#include "nbnxn_gpu.h"
  
  void print_time(FILE                     *out,
                  gmx_walltime_accounting_t walltime_accounting,
@@ -462,7 +463,7 @@ static void do_nb_verlet(t_forcerec *fr,
  {
      int                        enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
      nonbonded_verlet_group_t  *nbvg;
-    gmx_bool                   bCUDA;
+    gmx_bool                   bUsingGpuKernels;
  
      if (!(flags & GMX_FORCE_NONBONDED))
      {
@@ -472,15 +473,15 @@ static void do_nb_verlet(t_forcerec *fr,
  
      nbvg = &fr->nbv->grp[ilocality];
  
-    /* CUDA kernel launch overhead is already timed separately */
+    /* GPU kernel launch overhead is already timed separately */
      if (fr->cutoff_scheme != ecutsVERLET)
      {
          gmx_incons("Invalid cut-off scheme passed!");
      }
  
-    bCUDA = (nbvg->kernel_type == nbnxnk8x8x8_CUDA);
+    bUsingGpuKernels = (nbvg->kernel_type == nbnxnk8x8x8_GPU);
  
-    if (!bCUDA)
+    if (!bUsingGpuKernels)
      {
          wallcycle_sub_start(wcycle, ewcsNONBONDED);
      }
@@ -526,8 +527,8 @@ static void do_nb_verlet(t_forcerec *fr,
                                     enerd->grpp.ener[egLJSR]);
              break;
  
-        case nbnxnk8x8x8_CUDA:
-            nbnxn_cuda_launch_kernel(fr->nbv->cu_nbv, nbvg->nbat, flags, ilocality);
+        case nbnxnk8x8x8_GPU:
+            nbnxn_gpu_launch_kernel(fr->nbv->gpu_nbv, nbvg->nbat, flags, ilocality);
              break;
  
          case nbnxnk8x8x8_PlainC:
@@ -548,7 +549,7 @@ static void do_nb_verlet(t_forcerec *fr,
              gmx_incons("Invalid nonbonded kernel type passed!");
  
      }
-    if (!bCUDA)
+    if (!bUsingGpuKernels)
      {
          wallcycle_sub_stop(wcycle, ewcsNONBONDED);
      }
@@ -557,8 +558,8 @@ static void do_nb_verlet(t_forcerec *fr,
      {
          enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
      }
-    else if ((!bCUDA && nbvg->ewald_excl == ewaldexclAnalytical) ||
-             (bCUDA && nbnxn_cuda_is_kernel_ewald_analytical(fr->nbv->cu_nbv)))
+    else if ((!bUsingGpuKernels && nbvg->ewald_excl == ewaldexclAnalytical) ||
+             (bUsingGpuKernels && nbnxn_gpu_is_kernel_ewald_analytical(fr->nbv->gpu_nbv)))
      {
          enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
      }
@@ -913,12 +914,12 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          if (bNS)
          {
              wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
-            nbnxn_cuda_init_atomdata(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+            nbnxn_gpu_init_atomdata(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
              wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
          }
  
          wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
-        nbnxn_cuda_upload_shiftvec(nbv->cu_nbv, nbv->grp[eintLocal].nbat);
+        nbnxn_gpu_upload_shiftvec(nbv->gpu_nbv, nbv->grp[eintLocal].nbat);
          wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
      }
  
@@ -940,9 +941,9 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          if (bUseGPU)
          {
              /* initialize local pair-list on the GPU */
-            nbnxn_cuda_init_pairlist(nbv->cu_nbv,
-                                     nbv->grp[eintLocal].nbl_lists.nbl[0],
-                                     eintLocal);
+            nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
+                                    nbv->grp[eintLocal].nbl_lists.nbl[0],
+                                    eintLocal);
          }
          wallcycle_stop(wcycle, ewcNS);
      }
@@ -1008,12 +1009,12 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
  
-            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_CUDA)
+            if (nbv->grp[eintNonlocal].kernel_type == nbnxnk8x8x8_GPU)
              {
                  /* initialize non-local pair-list on the GPU */
-                nbnxn_cuda_init_pairlist(nbv->cu_nbv,
-                                         nbv->grp[eintNonlocal].nbl_lists.nbl[0],
-                                         eintNonlocal);
+                nbnxn_gpu_init_pairlist(nbv->gpu_nbv,
+                                        nbv->grp[eintNonlocal].nbl_lists.nbl[0],
+                                        eintNonlocal);
              }
              wallcycle_stop(wcycle, ewcNS);
          }
@@ -1053,11 +1054,11 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
          if (DOMAINDECOMP(cr) && !bDiffKernels)
          {
-            nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintNonlocal].nbat,
-                                      flags, eatNonlocal);
+            nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintNonlocal].nbat,
+                                     flags, eatNonlocal);
          }
-        nbnxn_cuda_launch_cpyback(nbv->cu_nbv, nbv->grp[eintLocal].nbat,
-                                  flags, eatLocal);
+        nbnxn_gpu_launch_cpyback(nbv->gpu_nbv, nbv->grp[eintLocal].nbat,
+                                 flags, eatLocal);
          cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
      }
  
@@ -1283,11 +1284,11 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                  float cycles_tmp;
  
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                nbnxn_cuda_wait_gpu(nbv->cu_nbv,
-                                    nbv->grp[eintNonlocal].nbat,
-                                    flags, eatNonlocal,
-                                    enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                    fr->fshift);
+                nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+                                       nbv->grp[eintNonlocal].nbat,
+                                       flags, eatNonlocal,
+                                       enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                       fr->fshift);
                  cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
                  cycles_wait_gpu += cycles_tmp;
                  cycles_force    += cycles_tmp;
@@ -1346,11 +1347,11 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
              const float cuda_api_overhead_margin = 50000.0f; /* cycles */
  
              wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-            nbnxn_cuda_wait_gpu(nbv->cu_nbv,
-                                nbv->grp[eintLocal].nbat,
-                                flags, eatLocal,
-                                enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
-                                fr->fshift);
+            nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
+                                   nbv->grp[eintLocal].nbat,
+                                   flags, eatLocal,
+                                   enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+                                   fr->fshift);
              cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
              if (bDoForces && DOMAINDECOMP(cr))
@@ -1384,7 +1385,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
              /* now clear the GPU outputs while we finish the step on the CPU */
  
              wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU_NB);
-            nbnxn_cuda_clear_outputs(nbv->cu_nbv, flags);
+            nbnxn_gpu_clear_outputs(nbv->gpu_nbv, flags);
              wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
          }
          else
@@ -2625,8 +2626,8 @@ void finish_run(FILE *fplog, t_commrec *cr,
  
      if (SIMMASTER(cr))
      {
-        wallclock_gpu_t* gputimes = use_GPU(nbv) ?
-            nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL;
+        struct gmx_wallclock_gpu_t* gputimes = use_GPU(nbv) ? nbnxn_gpu_get_timings(nbv->gpu_nbv) : NULL;
+
          wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
                          elapsed_time_over_all_ranks,
                          wcycle, gputimes);
diff --git a/src/gromacs/timing/gpu_timing.h b/src/gromacs/timing/gpu_timing.h

new file mode 100644 (file)

index 0000000..772b5ee
--- /dev/null
+++ b/src/gromacs/timing/gpu_timing.h
@@ -0,0 +1,74 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2012,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declares data types for GPU timing
+ *
+ *  \author Szilard Pall <pall.szilard@gmail.com>
+ *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \inlibraryapi
+ */
+
+#ifndef GMX_TIMING_GPU_TIMING_H
+#define GMX_TIMING_GPU_TIMING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \internal \brief Nonbonded kernel time and call count. */
+struct gmx_nbnxn_kernel_timing_data_t
+{
+    double  t; /**< Accumulated lapsed time */
+    int     c; /**< Number of calls corresponding to the elapsed time */
+};
+
+/*! \internal \brief GPU timings for kernels and H2d/D2H transfers. */
+struct gmx_wallclock_gpu_t
+{
+    struct gmx_nbnxn_kernel_timing_data_t ktime[2][2]; /**< table containing the timings of the four
+                                                          versions of the nonbonded kernels: force-only,
+                                                          force+energy, force+pruning, and force+energy+pruning */
+    double  nb_h2d_t;                                  /**< host to device transfer time in nb calculation  */
+    double  nb_d2h_t;                                  /**< device to host transfer time in nb calculation */
+    int     nb_c;                                      /**< total call count of the nonbonded gpu operations */
+    double  pl_h2d_t;                                  /**< pair search step host to device transfer time */
+    int     pl_h2d_c;                                  /**< pair search step  host to device transfer call count */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gromacs/timing/wallcycle.c b/src/gromacs/timing/wallcycle.c

index 81f58b7aeccb2c5ecb375981ac6c7f0ece9ebf6a..2983cd74ecc8c1dc1ab9f236e227fdef9ab75d3a 100644 (file)
--- a/src/gromacs/timing/wallcycle.c
+++ b/src/gromacs/timing/wallcycle.c
@@ -45,6 +45,7 @@
  #include "gromacs/legacyheaders/md_logging.h"
  #include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/timing/cyclecounter.h"
+#include "gromacs/timing/gpu_timing.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/smalloc.h"
@@ -634,7 +635,7 @@ static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, i
  }
  
  void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
-                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
+                     gmx_wallcycle_t wc, struct gmx_wallclock_gpu_t *gpu_t)
  {
      double     *cyc_sum;
      double      tot, tot_for_pp, tot_for_rest, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, tot_k;
diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h

index d0295b67dd8f6288a933f6bcd0d94ce0513bd3c9..1ca527949fcf98b18153b03d7b999f66400f8d09 100644 (file)
--- a/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@ -43,7 +43,6 @@
  #include <stdio.h>
  
  #include "gromacs/legacyheaders/types/commrec_fwd.h"
-#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
  #include "gromacs/utility/basedefinitions.h"
  
  #ifdef __cplusplus
@@ -51,6 +50,7 @@ extern "C" {
  #endif
  
  typedef struct gmx_wallcycle *gmx_wallcycle_t;
+struct gmx_wallclock_gpu_t;
  
  enum {
      ewcRUN, ewcSTEP, ewcPPDURINGPME, ewcDOMDEC, ewcDDCOMMLOAD,
@@ -105,7 +105,7 @@ void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc);
  /* Sum the cycles over the nodes in cr->mpi_comm_mysim */
  
  void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
-                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t);
+                     gmx_wallcycle_t wc, struct gmx_wallclock_gpu_t *gpu_t);
  /* Print the cycle and time accounting */
  
  gmx_int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc);
diff --git a/src/programs/mdrun/md.cpp b/src/programs/mdrun/md.cpp

index b497e5dd807a3b7b171af9927209dd49b94ba180..de011d5cb3a1a6653b990819982d99dd5ab0e659 100644 (file)
--- a/src/programs/mdrun/md.cpp
+++ b/src/programs/mdrun/md.cpp
@@ -95,7 +95,7 @@
  #include "gromacs/mdlib/compute_io.h"
  #include "gromacs/mdlib/mdrun_signalling.h"
  #include "gromacs/mdlib/nb_verlet.h"
-#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/pulling/pull.h"
@@ -133,7 +133,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
      md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
                    gmx_step_str(step, sbuf));
  
-    nbnxn_cuda_reset_timings(nbv);
+    nbnxn_gpu_reset_timings(nbv);
  
      wallcycle_stop(wcycle, ewcRUN);
      wallcycle_reset_all(wcycle);
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp

index 0e5449822ee6a2f91fa154e7bc0c2e9a2121d575..4422b7a13d450510957a425955e3279da9562da2 100644 (file)
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -50,6 +50,7 @@
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/ewald/pme.h"
  #include "gromacs/fileio/tpxio.h"
+#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  #include "gromacs/legacyheaders/checkpoint.h"
  #include "gromacs/legacyheaders/constr.h"
  #include "gromacs/legacyheaders/disre.h"
@@ -94,8 +95,6 @@
  #include "corewrap.h"
  #endif
  
-#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
-
  typedef struct {
      gmx_integrator_t *func;
  } gmx_intp_t;
@@ -374,10 +373,11 @@ static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
      }
  
      bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
-                  hwinfo->gpu_info.ncuda_dev_compatible > 0);
+                  hwinfo->gpu_info.n_dev_compatible > 0);
+
      if (bCanUseGPU)
      {
-        ngpu = hwinfo->gpu_info.ncuda_dev_compatible;
+        ngpu = hwinfo->gpu_info.n_dev_compatible;
      }
      else
      {
@@ -857,10 +857,12 @@ static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
      gmx_parse_gpu_ids(&hw_opt->gpu_opt);
  
  #ifdef GMX_THREAD_MPI
-    if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
+    if (hw_opt->gpu_opt.n_dev_use > 0
+        &&
+        hw_opt->nthreads_tmpi == 0)
      {
          /* Set the number of MPI threads equal to the number of GPUs */
-        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use;
+        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.n_dev_use;
  
          if (hw_opt->nthreads_tot > 0 &&
              hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
@@ -1028,7 +1030,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
          if (inputrec->cutoff_scheme == ecutsVERLET)
          {
              /* Here the master rank decides if all ranks will use GPUs */
-            bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 ||
+            bUseGPU = (hwinfo->gpu_info.n_dev_compatible > 0 ||
                         getenv("GMX_EMULATE_GPU") != NULL);
  
              /* TODO add GPU kernels for this and replace this check by:
@@ -1058,7 +1060,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                  gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme");
              }
  
-            if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
+            if (hwinfo->gpu_info.n_dev_compatible > 0)
              {
                  md_print_warn(cr, fplog,
                                "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
@@ -1411,7 +1413,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
      else
      {
          /* Ignore (potentially) manually selected GPUs */
-        hw_opt->gpu_opt.ncuda_dev_use = 0;
+        hw_opt->gpu_opt.n_dev_use = 0;
      }
  
      /* check consistency across ranks of things like SIMD
author	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 12 Dec 2014 15:56:27 +0000 (16:56 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 30 Apr 2015 08:43:05 +0000 (10:43 +0200)
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/ewald/pme-load-balancing.c		patch \| blob \| history
src/gromacs/gmxlib/CMakeLists.txt		patch \| blob \| history
src/gromacs/gmxlib/copyrite.cpp		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/copyrite_gpu.cu		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/cudautils.cuh		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.cu		patch \| blob \| history
src/gromacs/gmxlib/cuda_tools/pmalloc_cuda.h		patch \| blob \| history
src/gromacs/gmxlib/gmx_detect_hardware.cpp		patch \| blob \| history
src/gromacs/gmxlib/gpu_utils/CMakeLists.txt		patch \| blob \| history
src/gromacs/gmxlib/gpu_utils/gpu_macros.h	[new file with mode: 0644]	patch \| blob
src/gromacs/gmxlib/gpu_utils/gpu_utils.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/gmxlib/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gmxlib/gpu_utils/gpu_utils.h		patch \| blob \| history
src/gromacs/gmxlib/nrnb.c		patch \| blob \| history
src/gromacs/legacyheaders/force.h		patch \| blob \| history
src/gromacs/legacyheaders/types/hw_info.h		patch \| blob \| history
src/gromacs/math/utilities.h		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/nb_verlet.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_atomdata.c		patch \| blob \| history
src/gromacs/mdlib/nbnxn_consts.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h	[deleted file]	patch \| blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_jit_support.cu	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_gpu.h	[moved from src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h with 56% similarity]	patch \| blob \| history
src/gromacs/mdlib/nbnxn_gpu_data_mgmt.h	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/nbnxn_gpu_jit_support.h	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/nbnxn_gpu_types.h	[moved from src/gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h with 55% similarity]	patch \| blob \| history
src/gromacs/mdlib/nbnxn_search.c		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/timing/gpu_timing.h	[new file with mode: 0644]	patch \| blob
src/gromacs/timing/wallcycle.c		patch \| blob \| history
src/gromacs/timing/wallcycle.h		patch \| blob \| history
src/programs/mdrun/md.cpp		patch \| blob \| history
src/programs/mdrun/runner.cpp		patch \| blob \| history