Make PME GPU structures use DeviceBuffer
authorAleksei Iupinov <a.yupinov@gmail.com>
Thu, 3 May 2018 14:20:50 +0000 (16:20 +0200)
committerSzilárd Páll <pall.szilard@gmail.com>
Wed, 30 May 2018 23:42:12 +0000 (01:42 +0200)
A couple of related function calls in PME are changed to match.
allocateDeviceBuffer() is now correctly called with the GPU
context handle instead of a stream handle (this was not causing
errors before, since the context is only a stub in the CUDA
implementation).
pme-gpu-internal.cpp is now compiled with the CUDA compiler
(to handle gputraits.cuh, which is included through devicebuffer.h),
paving the way for unified CUDA/OpenCL host code.
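
For reference, the core pattern of this change in condensed form
(a sketch assembled from the hunks below, not compilable on its own):

    #include "gromacs/gpu_utils/devicebuffer.h"

    // A raw device pointer member becomes a typed buffer handle:
    DeviceBuffer<float> d_virialAndEnergy;   // was: float *d_virialAndEnergy;

    // Allocation/free go through the DeviceBuffer helpers, and the GPU
    // context handle (not the stream) is passed to the allocator:
    allocateDeviceBuffer(&d_virialAndEnergy, c_virialAndEnergyCount,
                         pmeGpu->archSpecific->context);
    freeDeviceBuffer(&d_virialAndEnergy);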

Change-Id: I3621bd93e4b2cc7c02161323f532ca62b48763ab

src/gromacs/CMakeLists.txt
src/gromacs/ewald/CMakeLists.txt
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-types.h
src/gromacs/ewald/pme.cu
src/gromacs/ewald/pme.cuh

index f4b6337bebd1cabbdd62178c87597cb661ee3fe6..b84d7833239b9988a92de0fa6cbcf8117a2e9da2 100644 (file)
@@ -39,6 +39,7 @@ if (GMX_CLANG_CUDA)
 endif()
 
 set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
 set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
 set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
 
@@ -57,6 +58,12 @@ function (gmx_add_libgromacs_sources)
     _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
 endfunction ()
 
+# TODO Reconsider this, as the CUDA driver API is probably a simpler
+# approach, at least for the build system. See Redmine #2530
+function (gmx_compile_cpp_as_cuda)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN})
+endfunction ()
+
 function (gmx_install_headers)
     if (NOT GMX_BUILD_MDRUN_ONLY)
         file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR})
@@ -158,11 +165,18 @@ gmx_configure_version_file(
     REMOTE_HASH)
 list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
 
+# Mark some shared GPU implementation files to compile with CUDA if needed
+if (GMX_USE_CUDA)
+    get_property(LIBGROMACS_GPU_IMPL_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
+    set_source_files_properties(${LIBGROMACS_GPU_IMPL_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+endif()
+
 # set up CUDA compilation with clang
 if (GMX_CLANG_CUDA)
     foreach (_file ${LIBGROMACS_SOURCES})
         get_filename_component(_ext ${_file} EXT)
-        if (${_ext} STREQUAL ".cu")
+        get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT)
+        if (${_ext} STREQUAL ".cu" OR _cuda_source_format)
             gmx_compile_cuda_file_with_clang(${_file})
         endif()
     endforeach()
index a8f444ce6f216f488b1470f519c4678c065c3687..a95e1ed3674eaf570edceb6597e8fa0d806f2a91 100644 (file)
@@ -64,6 +64,9 @@ if (GMX_USE_CUDA)
         pme-gpu.cpp
         pme-gpu-internal.cpp
         )
+    gmx_compile_cpp_as_cuda(
+        pme-gpu-internal.cpp
+        )
 else()
     gmx_add_libgromacs_sources(
         # Files that implement stubs
index 105f88fad72526e441614b21756680c72d128430..918f74770704ce04de8becb3c4366a2efa145267 100644 (file)
@@ -38,6 +38,9 @@
  * \brief This file contains internal function implementations
  * for performing the PME calculations on GPU.
  *
+ * Note that this file is compiled as regular C++ source in OpenCL builds, but
+ * it is treated as CUDA source in CUDA-enabled GPU builds.
+ *
  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  * \ingroup module_ewald
  */
@@ -53,7 +56,6 @@
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/math/units.h"
-#include "gromacs/mdtypes/commrec.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/logger.h"
index 936a391455ef4c727a533ff9751487b53b76b1f8..55d04169623a7959c19eb94afb072fae64d6823e 100644 (file)
@@ -44,6 +44,8 @@
 #ifndef GMX_EWALD_PME_GPU_TYPES_H
 #define GMX_EWALD_PME_GPU_TYPES_H
 
+#include "gromacs/gpu_utils/devicebuffer.h"
+
 /* What follows is all the PME GPU function arguments,
  * sorted into several device-side structures depending on the update rate.
  * This is GPU agnostic (float3 replaced by float[3], etc.).
@@ -61,7 +63,7 @@ struct PmeGpuConstParams
     float elFactor;
     /*! \brief Virial and energy GPU array. Size is PME_GPU_ENERGY_AND_VIRIAL_COUNT (7) floats.
      * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
-    float *d_virialAndEnergy;
+    DeviceBuffer<float> d_virialAndEnergy;
 };
 
 /*! \internal \brief
@@ -82,11 +84,11 @@ struct PmeGpuGridParams
     /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */
     int   complexGridSizePadded[DIM];
 
-    /* Grid pointers */
+    /* Grid arrays */
     /*! \brief Real space grid. */
-    float *d_realGrid;
-    /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same pointer as realGrid. */
-    float *d_fourierGrid;
+    DeviceBuffer<float> d_realGrid;
+    /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same handle as realGrid. */
+    DeviceBuffer<float> d_fourierGrid;
 
     /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
     float ewaldFactor;
@@ -94,17 +96,17 @@ struct PmeGpuGridParams
     /*! \brief Grid spline values as in pme->bsp_mod
      * (laid out sequentially (XXX....XYYY......YZZZ.....Z))
      */
-    float              *d_splineModuli;
+    DeviceBuffer<float> d_splineModuli;
     /*! \brief Offsets for X/Y/Z components of d_splineModuli */
     int                 splineValuesOffset[DIM];
 
     /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
-    float               *d_fractShiftsTable;
+    DeviceBuffer<float> d_fractShiftsTable;
     /*! \brief Gridline indices lookup table
      * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
-    int                *d_gridlineIndicesTable;
+    DeviceBuffer<int> d_gridlineIndicesTable;
     /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
-    int                 tablesOffsets[DIM];
+    int               tablesOffsets[DIM];
 };
 
 /*! \internal \brief
@@ -115,32 +117,32 @@ struct PmeGpuAtomParams
 {
     /*! \brief Number of local atoms */
     int    nAtoms;
-    /*! \brief Pointer to the global GPU memory with input rvec atom coordinates.
+    /*! \brief Global GPU memory array handle with input rvec atom coordinates.
      * The coordinates themselves change and need to be copied to the GPU for every PME computation,
      * but reallocation happens only at DD.
      */
-    float *d_coordinates;
-    /*! \brief Pointer to the global GPU memory with input atom charges.
+    DeviceBuffer<float> d_coordinates;
+    /*! \brief Global GPU memory array handle with input atom charges.
      * The charges only need to be reallocated and copied to the GPU at DD step.
      */
-    float  *d_coefficients;
-    /*! \brief Pointer to the global GPU memory with input/output rvec atom forces.
+    DeviceBuffer<float> d_coefficients;
+    /*! \brief Global GPU memory array handle with input/output rvec atom forces.
      * The forces change and need to be copied from (and possibly to) the GPU for every PME computation,
      * but reallocation happens only at DD.
      */
-    float  *d_forces;
-    /*! \brief Pointer to the global GPU memory with ivec atom gridline indices.
+    DeviceBuffer<float> d_forces;
+    /*! \brief Global GPU memory array handle with ivec atom gridline indices.
      * Computed on GPU in the spline calculation part.
      */
-    int *d_gridlineIndices;
+    DeviceBuffer<int> d_gridlineIndices;
 
     /* B-spline parameters are computed entirely on GPU for every PME computation, not copied.
      * Unless we want to try something like GPU spread + CPU gather?
      */
-    /*! \brief Pointer to the global GPU memory with B-spline values */
-    float  *d_theta;
-    /*! \brief Pointer to the global GPU memory with B-spline derivative values */
-    float  *d_dtheta;
+    /*! \brief Global GPU memory array handle with B-spline values */
+    DeviceBuffer<float> d_theta;
+    /*! \brief Global GPU memory array handle with B-spline derivative values */
+    DeviceBuffer<float> d_dtheta;
 };
 
 /*! \internal \brief
index 9d1d6c7cfe65cb2a2795b29b0ece90b326865d83..415982f319cee043956f651e80022259ecfd07fa 100644 (file)
@@ -55,6 +55,7 @@
 
 #include "pme.cuh"
 #include "pme-3dfft.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-grid.h"
 
 int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
@@ -80,16 +81,13 @@ void pme_gpu_synchronize(const PmeGpu *pmeGpu)
 void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
 {
     const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
-    cudaError_t  stat                = cudaMalloc((void **)&pmeGpu->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
-    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
+    allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, pmeGpu->archSpecific->context);
     pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
 }
 
 void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
 {
-    cudaError_t stat = cudaFree(pmeGpu->kernelParams->constants.d_virialAndEnergy);
-    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
-    pmeGpu->kernelParams->constants.d_virialAndEnergy = nullptr;
+    freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
     pfree(pmeGpu->staging.h_virialAndEnergy);
     pmeGpu->staging.h_virialAndEnergy = nullptr;
 }
@@ -114,7 +112,7 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
         pmeGpu->kernelParams->grid.realGridSize[ZZ];
     const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
     reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
-                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
     if (shouldRealloc)
     {
         /* Reallocate the host buffer */
@@ -142,7 +140,7 @@ void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
     const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
-                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
     pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
     pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
 }
@@ -175,7 +173,7 @@ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
     const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
-                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->context);
     if (c_usePadding)
     {
         const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
@@ -213,7 +211,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const flo
     const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
-                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                        0, pmeGpu->kernelParams->atoms.nAtoms,
                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
@@ -246,9 +244,9 @@ void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
     int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
     int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
-                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
+                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
-                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
     // the host side reallocation
     if (shouldRealloc)
     {
@@ -273,7 +271,7 @@ void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
     const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
-                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
     pfree(pmeGpu->staging.h_gridlineIndices);
     pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
 }
@@ -298,16 +296,16 @@ void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
     {
         /* 2 separate grids */
         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
-                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
-                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
     }
     else
     {
         /* A single buffer so that any grid will fit */
         const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
-                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
         kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
         pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
         // the size might get used later for copying the grid
@@ -458,6 +456,9 @@ void pme_gpu_init_internal(PmeGpu *pmeGpu)
     // TODO: Consider turning on by default when we can detect nr of streams.
     pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
 
+    // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context
+    pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
+
     // Prepare to use the device that this PME task was assigned earlier.
     CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
 
index 55aca93b5bfeb684c05434f4a896191b2652b212..eafe5f6ddc526cf2938e166f07c59be36f284776 100644 (file)
@@ -209,6 +209,14 @@ struct PmeGpuCuda
     /*! \brief The CUDA stream where everything related to the PME happens. */
     cudaStream_t pmeStream;
 
+    /*! \brief
+     * A handle to the GPU context.
+     * TODO: this is currently extracted from the implementation of pmeGpu->programHandle_,
+     * but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram,
+     * managed by high-level code.
+     */
+    Context context;
+
     /* Synchronization events */
     /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
     cudaEvent_t syncSpreadGridD2H;