Make PME GPU structures use DeviceBuffer
authorAleksei Iupinov <a.yupinov@gmail.com>
Thu, 3 May 2018 14:20:50 +0000 (16:20 +0200)
committerSzilárd Páll <pall.szilard@gmail.com>
Wed, 30 May 2018 23:42:12 +0000 (01:42 +0200)
A couple of related function calls in PME are changed to match.
allocateDeviceBuffer() is now correctly called with the GPU
context handle instead of a stream handle (this was not causing
errors before, since the context is only a stub in the CUDA
implementation).
pme-gpu-internal.cpp is now compiled with the CUDA compiler
(to handle gputraits.cuh, which is included through devicebuffer.h),
paving the way for unified CUDA/OpenCL host code.
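
For reference, the core pattern of this change in condensed form
(a sketch assembled from the hunks below, not compilable on its own):

    #include "gromacs/gpu_utils/devicebuffer.h"

    // A raw device pointer member becomes a typed buffer handle:
    DeviceBuffer<float> d_virialAndEnergy;   // was: float *d_virialAndEnergy;

    // Allocation/free go through the DeviceBuffer helpers, and the GPU
    // context handle (not the stream) is passed to the allocator:
    allocateDeviceBuffer(&d_virialAndEnergy, c_virialAndEnergyCount,
                         pmeGpu->archSpecific->context);
    freeDeviceBuffer(&d_virialAndEnergy);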

Change-Id: I3621bd93e4b2cc7c02161323f532ca62b48763ab

src/gromacs/CMakeLists.txt
src/gromacs/ewald/CMakeLists.txt
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-types.h
src/gromacs/ewald/pme.cu
src/gromacs/ewald/pme.cuh

index f4b6337bebd1cabbdd62178c87597cb661ee3fe6..b84d7833239b9988a92de0fa6cbcf8117a2e9da2 100644 (file)
@@ -39,6 +39,7 @@ if (GMX_CLANG_CUDA)
 endif()
 
 set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
+set_property(GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
 set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
 set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
 
@@ -57,6 +58,12 @@ function (gmx_add_libgromacs_sources)
     _gmx_add_files_to_property(GMX_LIBGROMACS_SOURCES ${ARGN})
 endfunction ()
 
+# TODO Reconsider this, as the CUDA driver API is probably a simpler
+# approach, at least for the build system. See Redmine #2530
+function (gmx_compile_cpp_as_cuda)
+    _gmx_add_files_to_property(GMX_LIBGROMACS_GPU_IMPL_SOURCES ${ARGN})
+endfunction ()
+
 function (gmx_install_headers)
     if (NOT GMX_BUILD_MDRUN_ONLY)
         file(RELATIVE_PATH _dest ${PROJECT_SOURCE_DIR}/src ${CMAKE_CURRENT_LIST_DIR})
@@ -158,11 +165,18 @@ gmx_configure_version_file(
     REMOTE_HASH)
 list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
 
+# Mark some shared GPU implementation files to compile with CUDA if needed
+if (GMX_USE_CUDA)
+    get_property(LIBGROMACS_GPU_IMPL_SOURCES GLOBAL PROPERTY GMX_LIBGROMACS_GPU_IMPL_SOURCES)
+    set_source_files_properties(${LIBGROMACS_GPU_IMPL_SOURCES} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+endif()
+
 # set up CUDA compilation with clang
 if (GMX_CLANG_CUDA)
     foreach (_file ${LIBGROMACS_SOURCES})
         get_filename_component(_ext ${_file} EXT)
-        if (${_ext} STREQUAL ".cu")
+        get_source_file_property(_cuda_source_format ${_file} CUDA_SOURCE_PROPERTY_FORMAT)
+        if (${_ext} STREQUAL ".cu" OR _cuda_source_format)
             gmx_compile_cuda_file_with_clang(${_file})
         endif()
     endforeach()
index a8f444ce6f216f488b1470f519c4678c065c3687..a95e1ed3674eaf570edceb6597e8fa0d806f2a91 100644 (file)
@@ -64,6 +64,9 @@ if (GMX_USE_CUDA)
         pme-gpu.cpp
         pme-gpu-internal.cpp
         )
+    gmx_compile_cpp_as_cuda(
+        pme-gpu-internal.cpp
+        )
 else()
     gmx_add_libgromacs_sources(
         # Files that implement stubs
index 105f88fad72526e441614b21756680c72d128430..918f74770704ce04de8becb3c4366a2efa145267 100644 (file)
@@ -38,6 +38,9 @@
  * \brief This file contains internal function implementations
  * for performing the PME calculations on GPU.
  *
+ * Note that this file is compiled as regular C++ source in OpenCL builds, but
+ * it is treated as CUDA source in CUDA-enabled GPU builds.
+ *
  * \author Aleksei Iupinov <a.yupinov@gmail.com>
  * \ingroup module_ewald
  */
@@ -53,7 +56,6 @@
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/math/units.h"
-#include "gromacs/mdtypes/commrec.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/logger.h"
index 936a391455ef4c727a533ff9751487b53b76b1f8..55d04169623a7959c19eb94afb072fae64d6823e 100644 (file)
@@ -44,6 +44,8 @@
 #ifndef GMX_EWALD_PME_GPU_TYPES_H
 #define GMX_EWALD_PME_GPU_TYPES_H
 
+#include "gromacs/gpu_utils/devicebuffer.h"
+
 /* What follows is all the PME GPU function arguments,
  * sorted into several device-side structures depending on the update rate.
  * This is GPU agnostic (float3 replaced by float[3], etc.).
@@ -61,7 +63,7 @@ struct PmeGpuConstParams
     float elFactor;
     /*! \brief Virial and energy GPU array. Size is PME_GPU_ENERGY_AND_VIRIAL_COUNT (7) floats.
      * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
-    float *d_virialAndEnergy;
+    DeviceBuffer<float> d_virialAndEnergy;
 };
 
 /*! \internal \brief
@@ -82,11 +84,11 @@ struct PmeGpuGridParams
     /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */
     int   complexGridSizePadded[DIM];
 
-    /* Grid pointers */
+    /* Grid arrays */
     /*! \brief Real space grid. */
-    float *d_realGrid;
-    /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same pointer as realGrid. */
-    float *d_fourierGrid;
+    DeviceBuffer<float> d_realGrid;
+    /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same handle as realGrid. */
+    DeviceBuffer<float> d_fourierGrid;
 
     /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
     float ewaldFactor;
@@ -94,17 +96,17 @@ struct PmeGpuGridParams
     /*! \brief Grid spline values as in pme->bsp_mod
      * (laid out sequentially (XXX....XYYY......YZZZ.....Z))
      */
-    float              *d_splineModuli;
+    DeviceBuffer<float> d_splineModuli;
     /*! \brief Offsets for X/Y/Z components of d_splineModuli */
     int                 splineValuesOffset[DIM];
 
     /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
-    float               *d_fractShiftsTable;
+    DeviceBuffer<float> d_fractShiftsTable;
     /*! \brief Gridline indices lookup table
      * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
-    int                *d_gridlineIndicesTable;
+    DeviceBuffer<int> d_gridlineIndicesTable;
     /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
-    int                 tablesOffsets[DIM];
+    int               tablesOffsets[DIM];
 };
 
 /*! \internal \brief
@@ -115,32 +117,32 @@ struct PmeGpuAtomParams
 {
     /*! \brief Number of local atoms */
     int    nAtoms;
-    /*! \brief Pointer to the global GPU memory with input rvec atom coordinates.
+    /*! \brief Global GPU memory array handle with input rvec atom coordinates.
      * The coordinates themselves change and need to be copied to the GPU for every PME computation,
      * but reallocation happens only at DD.
      */
-    float *d_coordinates;
-    /*! \brief Pointer to the global GPU memory with input atom charges.
+    DeviceBuffer<float> d_coordinates;
+    /*! \brief Global GPU memory array handle with input atom charges.
      * The charges only need to be reallocated and copied to the GPU at DD step.
      */
-    float  *d_coefficients;
-    /*! \brief Pointer to the global GPU memory with input/output rvec atom forces.
+    DeviceBuffer<float> d_coefficients;
+    /*! \brief Global GPU memory array handle with input/output rvec atom forces.
      * The forces change and need to be copied from (and possibly to) the GPU for every PME computation,
      * but reallocation happens only at DD.
      */
-    float  *d_forces;
-    /*! \brief Pointer to the global GPU memory with ivec atom gridline indices.
+    DeviceBuffer<float> d_forces;
+    /*! \brief Global GPU memory array handle with ivec atom gridline indices.
      * Computed on GPU in the spline calculation part.
      */
-    int *d_gridlineIndices;
+    DeviceBuffer<int> d_gridlineIndices;
 
     /* B-spline parameters are computed entirely on GPU for every PME computation, not copied.
      * Unless we want to try something like GPU spread + CPU gather?
      */
-    /*! \brief Pointer to the global GPU memory with B-spline values */
-    float  *d_theta;
-    /*! \brief Pointer to the global GPU memory with B-spline derivative values */
-    float  *d_dtheta;
+    /*! \brief Global GPU memory array handle with B-spline values */
+    DeviceBuffer<float> d_theta;
+    /*! \brief Global GPU memory array handle with B-spline derivative values */
+    DeviceBuffer<float> d_dtheta;
 };
 
 /*! \internal \brief
index 9d1d6c7cfe65cb2a2795b29b0ece90b326865d83..415982f319cee043956f651e80022259ecfd07fa 100644 (file)
@@ -55,6 +55,7 @@
 
 #include "pme.cuh"
 #include "pme-3dfft.cuh"
+#include "pme-gpu-program-impl.h"
 #include "pme-grid.h"
 
 int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
@@ -80,16 +81,13 @@ void pme_gpu_synchronize(const PmeGpu *pmeGpu)
 void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
 {
     const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
-    cudaError_t  stat                = cudaMalloc((void **)&pmeGpu->kernelParams->constants.d_virialAndEnergy, energyAndVirialSize);
-    CU_RET_ERR(stat, "cudaMalloc failed on PME energy and virial");
+    allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, pmeGpu->archSpecific->context);
     pmalloc((void **)&pmeGpu->staging.h_virialAndEnergy, energyAndVirialSize);
 }
 
 void pme_gpu_free_energy_virial(PmeGpu *pmeGpu)
 {
-    cudaError_t stat = cudaFree(pmeGpu->kernelParams->constants.d_virialAndEnergy);
-    CU_RET_ERR(stat, "cudaFree failed on PME energy and virial");
-    pmeGpu->kernelParams->constants.d_virialAndEnergy = nullptr;
+    freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
     pfree(pmeGpu->staging.h_virialAndEnergy);
     pmeGpu->staging.h_virialAndEnergy = nullptr;
 }
@@ -114,7 +112,7 @@ void pme_gpu_realloc_and_copy_bspline_values(const PmeGpu *pmeGpu)
         pmeGpu->kernelParams->grid.realGridSize[ZZ];
     const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
     reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
-                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->splineValuesSize, &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
     if (shouldRealloc)
     {
         /* Reallocate the host buffer */
@@ -142,7 +140,7 @@ void pme_gpu_realloc_forces(PmeGpu *pmeGpu)
     const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
-                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
     pmeGpu->staging.h_forces.reserve(pmeGpu->nAtomsAlloc);
     pmeGpu->staging.h_forces.resize(pmeGpu->kernelParams->atoms.nAtoms);
 }
@@ -175,7 +173,7 @@ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
     const size_t newCoordinatesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newCoordinatesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, newCoordinatesSize,
-                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->coordinatesSize, &pmeGpu->archSpecific->coordinatesSizeAlloc, pmeGpu->archSpecific->context);
     if (c_usePadding)
     {
         const size_t paddingIndex = DIM * pmeGpu->kernelParams->atoms.nAtoms;
@@ -213,7 +211,7 @@ void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu *pmeGpu, const flo
     const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
-                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->coefficientsSize, &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, const_cast<float *>(h_coefficients),
                        0, pmeGpu->kernelParams->atoms.nAtoms,
                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
@@ -246,9 +244,9 @@ void pme_gpu_realloc_spline_data(const PmeGpu *pmeGpu)
     int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
     int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
-                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->pmeStream);
+                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
-                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
     // the host side reallocation
     if (shouldRealloc)
     {
@@ -273,7 +271,7 @@ void pme_gpu_realloc_grid_indices(const PmeGpu *pmeGpu)
     const size_t newIndicesSize = DIM * pmeGpu->nAtomsAlloc;
     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
-                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                           &pmeGpu->archSpecific->gridlineIndicesSize, &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
     pfree(pmeGpu->staging.h_gridlineIndices);
     pmalloc((void **)&pmeGpu->staging.h_gridlineIndices, newIndicesSize * sizeof(int));
 }
@@ -298,16 +296,16 @@ void pme_gpu_realloc_grids(PmeGpu *pmeGpu)
     {
         /* 2 separate grids */
         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
-                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                               &pmeGpu->archSpecific->complexGridSize, &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
-                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
     }
     else
     {
         /* A single buffer so that any grid will fit */
         const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newGridsSize,
-                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->pmeStream);
+                               &pmeGpu->archSpecific->realGridSize, &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
         kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
         pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
         // the size might get used later for copying the grid
@@ -458,6 +456,9 @@ void pme_gpu_init_internal(PmeGpu *pmeGpu)
     // TODO: Consider turning on by default when we can detect nr of streams.
     pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
 
+    // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context
+    pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
+
     // Prepare to use the device that this PME task was assigned earlier.
     CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
 
index 55aca93b5bfeb684c05434f4a896191b2652b212..eafe5f6ddc526cf2938e166f07c59be36f284776 100644 (file)
@@ -209,6 +209,14 @@ struct PmeGpuCuda
     /*! \brief The CUDA stream where everything related to the PME happens. */
     cudaStream_t pmeStream;
 
+    /*! \brief
+     * A handle to the GPU context.
+     * TODO: this is currently extracted from the implementation of pmeGpu->programHandle_,
+     * but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram,
+     * managed by high-level code.
+     */
+    Context context;
+
     /* Synchronization events */
     /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
     cudaEvent_t syncSpreadGridD2H;