gmx_add_libgromacs_sources(
# OpenCL-specific sources
pme-gpu-3dfft-ocl.cpp
+ # GPU-specific sources
+ pme-gpu.cpp
+ pme-gpu-internal.cpp
+ pme-gpu-timings.cpp
# Files that implement stubs
pme-gpu-program-impl.cpp
)
#include "pme.cuh"
#include "pme-gpu-program-impl.h"
#include "pme-gpu-timings.h"
+#include "pme-gpu-utils.h"
/*! \brief
* An inline CUDA function: unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
#if GMX_GPU == GMX_GPU_CUDA
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#else
+#define warp_size 32 // FIXME remove this and rework macros
+#define PME_SPREADGATHER_ATOMS_PER_WARP 2
#endif
/* General settings for PME GPU behaviour */
*/
#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
-/*! \brief
- * The number of atoms processed by a single warp in spread/gather.
- * This macro depends on the templated order parameter (2 atoms per warp for order 4).
- * It is mostly used for spline data layout tweaked for coalesced access.
- */
-#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
-
/*! \brief
* Atom data alignment (in terms of number of atoms).
+ * The value is (16 * PME_SPREADGATHER_ATOMS_PER_WARP).
* If the GPU atom data buffers are padded (c_usePadding == true),
* then the number of atoms which would fit in the padded GPU buffers has to be divisible by this.
* The literal number (16) expresses the maximum spread/gather block width in warps;
* accordingly, the spread/gather block widths in warps should be its divisors
* (e.g. in pme-spread.cu: constexpr int c_spreadMaxThreadsPerBlock = 8 * warp_size;).
* There are debug asserts for this divisibility.
*/
-#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
-
-
+#define PME_ATOM_DATA_ALIGNMENT 32
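+// E.g. for order 4: 16 (max block width in warps) * 2 (atoms per warp) = 32.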
/*
* The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
* TODO: adjust those for OpenCL.
*/
+#if GMX_GPU == GMX_GPU_CUDA
+
+/*! \brief
+ * The number of atoms processed by a single warp in spread/gather.
+ * This macro depends on the templated order parameter (2 atoms per warp for order 4).
+ * It is mostly used for spline data layout tweaked for coalesced access.
+ */
+#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
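+// E.g. for order 4: PME_SPREADGATHER_THREADS_PER_ATOM = 4 * 4 = 16, so a 32-wide warp processes 32 / 16 = 2 atoms.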
+
//! Spreading max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime in most cases
constexpr int c_spreadMaxWarpsPerBlock = 8;
/* TODO: it has been observed that the kernel can be faster with smaller block sizes (2 or 4 warps)
//! Gathering min blocks per CUDA multiprocessor - for CC2.x, we just take the CUDA limit of 8 to avoid the warning
constexpr int c_gatherMinBlocksPerMP = (GMX_PTX_ARCH < 300) ? GMX_CUDA_MAX_BLOCKS_PER_MP : (GMX_CUDA_MAX_THREADS_PER_MP / c_gatherMaxThreadsPerBlock);
+#endif // GMX_GPU == GMX_GPU_CUDA
#endif
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "pme.cuh"
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gmxopencl.h"
#endif
#include "pme-gpu-3dfft.h"
#include "pme-gpu-types.h"
#include "pme-gpu-types-host.h"
#include "pme-gpu-types-host-impl.h"
+#include "pme-gpu-utils.h"
#include "pme-grid.h"
#include "pme-internal.h"
return kernelParamsPtr;
}
-int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
+int pme_gpu_get_atom_data_alignment(const PmeGpu *)
{
- const int order = pmeGpu->common->pme_order;
- GMX_ASSERT(order > 0, "Invalid PME order");
+ // TODO: this can be simplified, as PME_ATOM_DATA_ALIGNMENT is now a compile-time constant independent of the PME order
return PME_ATOM_DATA_ALIGNMENT;
}
int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
+#if GMX_GPU == GMX_GPU_CUDA
const int order = pmeGpu->common->pme_order;
GMX_ASSERT(order > 0, "Invalid PME order");
return PME_SPREADGATHER_ATOMS_PER_WARP;
+#else
+ GMX_UNUSED_VALUE(pmeGpu);
+ GMX_THROW(gmx::NotImplementedError("Atom alignment per warp has to be deduced dynamically for OpenCL"));
+#endif
}
void pme_gpu_synchronize(const PmeGpu *pmeGpu)
{
- cudaError_t stat = cudaStreamSynchronize(pmeGpu->archSpecific->pmeStream);
- CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
+ gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
const int newFractShiftsSize = cellCount * (nx + ny + nz);
+#if GMX_GPU == GMX_GPU_CUDA
initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
kernelParamsPtr->fractShiftsTableTexture,
pmeGpu->common->fsh.data(),
pmeGpu->common->nn.data(),
newFractShiftsSize,
pmeGpu->deviceInfo);
+#elif GMX_GPU == GMX_GPU_OPENCL
+ // No dedicated texture routines in OpenCL; use plain device buffers for these lookup tables instead.
+ allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize, pmeGpu->archSpecific->context);
+ allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize, pmeGpu->archSpecific->context);
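+ // Both lookup tables have newFractShiftsSize = cellCount * (nx + ny + nz) elements, matching the CUDA path above.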
+ copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(),
+ 0, newFractShiftsSize,
+ pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(),
+ 0, newFractShiftsSize,
+ pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr);
+#endif
}
void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu)
{
auto *kernelParamsPtr = pmeGpu->kernelParams.get();
+#if GMX_GPU == GMX_GPU_CUDA
destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
kernelParamsPtr->fractShiftsTableTexture,
pmeGpu->deviceInfo);
destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
kernelParamsPtr->gridlineIndicesTableTexture,
pmeGpu->deviceInfo);
+#elif GMX_GPU == GMX_GPU_OPENCL
+ freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
+ freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
+#endif
}
bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
* TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
*/
- /* WARNING: CUDA timings are incorrect with multiple streams.
- * This is the main reason why they are disabled by default.
- */
- // TODO: Consider turning on by default when we can detect nr of streams.
- pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ // Timing enabling - TODO: put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
+ if (GMX_GPU == GMX_GPU_CUDA)
+ {
+ /* WARNING: CUDA timings are incorrect with multiple streams.
+ * This is the main reason why they are disabled by default.
+ */
+ // TODO: Consider turning this on by default when we can detect the number of streams.
+ pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ }
+ else if (GMX_GPU == GMX_GPU_OPENCL)
+ {
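+ // OpenCL timing is enabled by default (can be disabled with GMX_DISABLE_GPU_TIMING).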
+ pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
+ }
// TODO: this is just a convenient reuse, because programHandle_ is currently in charge of creating the context
pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
+#if GMX_GPU == GMX_GPU_CUDA
// Prepare to use the device that this PME task was assigned earlier.
CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
+#endif
+#if GMX_GPU == GMX_GPU_CUDA
pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+#elif GMX_GPU == GMX_GPU_OPENCL
+ // TODO: we will need work size checks for OpenCL too
+#endif
- /* Creating a PME CUDA stream */
+ /* Creating a PME GPU stream:
+ * - default high priority with CUDA
+ * - no priorities implemented yet with OpenCL; see #2532
+ */
+#if GMX_GPU == GMX_GPU_CUDA
cudaError_t stat;
int highest_priority, lowest_priority;
stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
cudaStreamDefault, //cudaStreamNonBlocking,
highest_priority);
CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
+#elif GMX_GPU == GMX_GPU_OPENCL
+ cl_command_queue_properties queueProperties = pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
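+ // Note: CL_QUEUE_PROFILING_ENABLE must be set at queue creation for clGetEventProfilingInfo-based timing to work.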
+ cl_device_id device_id = pmeGpu->deviceInfo->ocl_gpu_id.ocl_device_id;
+ cl_int clError;
+ pmeGpu->archSpecific->pmeStream = clCreateCommandQueue(pmeGpu->archSpecific->context,
+ device_id, queueProperties, &clError);
+ if (clError != CL_SUCCESS)
+ {
+ GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
+ }
+#endif
}
void pme_gpu_destroy_specific(const PmeGpu *pmeGpu)
{
+#if GMX_GPU == GMX_GPU_CUDA
/* Destroy the CUDA stream */
cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
CU_RET_ERR(stat, "PME cudaStreamDestroy error");
+#elif GMX_GPU == GMX_GPU_OPENCL
+ cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
+ if (clError != CL_SUCCESS)
+ {
+ gmx_warning("Failed to destroy PME command queue");
+ }
+#endif
}
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
#define GMX_EWALD_PME_GPU_INTERNAL_H
#include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
-#include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros
+#include "gromacs/gpu_utils/gpu_macros.h" // for the GPU_FUNC_ macros
#include "gromacs/utility/arrayref.h"
#include "pme-gpu-types-host.h" // for the inline functions accessing PmeGpu members
*
* \param[in] pmeGpu The PME GPU structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM
/*! \libinternal \brief
* Allocates the fixed size energy and virial buffer both on GPU and CPU.
*
* Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const rvec *GPU_FUNC_ARGUMENT(h_coordinates)) GPU_FUNC_TERM
/*! \libinternal \brief
* Frees the coordinates on the GPU.
* \param[in] pmeGpu The PME GPU structure.
* \returns The input/output forces.
*/
-CUDA_FUNC_QUALIFIER gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(gmx::EmptyArrayRef())
+GPU_FUNC_QUALIFIER gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(gmx::EmptyArrayRef())
/*! \libinternal \brief
* Returns the output virial and energy of the PME solving.
* \param[out] energy The output energy.
* \param[out] virial The output virial matrix.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_get_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- real *CUDA_FUNC_ARGUMENT(energy),
- matrix CUDA_FUNC_ARGUMENT(virial)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_get_energy_virial(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ real *GPU_FUNC_ARGUMENT(energy),
+ matrix GPU_FUNC_ARGUMENT(virial)) GPU_FUNC_TERM
/*! \libinternal \brief
* Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
* \param[in] pmeGpu The PME GPU structure.
* \param[in] box The unit cell box.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const matrix CUDA_FUNC_ARGUMENT(box)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const matrix GPU_FUNC_ARGUMENT(box)) GPU_FUNC_TERM
/*! \libinternal \brief
* Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
* \param[in] dimIndex Dimension index.
* \param[in] transform Layout transform type
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const pme_atomcomm_t *CUDA_FUNC_ARGUMENT(atc),
- PmeSplineDataType CUDA_FUNC_ARGUMENT(type),
- int CUDA_FUNC_ARGUMENT(dimIndex),
- PmeLayoutTransform CUDA_FUNC_ARGUMENT(transform)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const pme_atomcomm_t *GPU_FUNC_ARGUMENT(atc),
+ PmeSplineDataType GPU_FUNC_ARGUMENT(type),
+ int GPU_FUNC_ARGUMENT(dimIndex),
+ PmeLayoutTransform GPU_FUNC_ARGUMENT(transform)) GPU_FUNC_TERM
/*! \libinternal \brief
* Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
* \param[out] gridSize Pointer to the grid dimensions to fill in.
* \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- gmx::IVec *CUDA_FUNC_ARGUMENT(gridSize),
- gmx::IVec *CUDA_FUNC_ARGUMENT(paddedGridSize)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ gmx::IVec *GPU_FUNC_ARGUMENT(gridSize),
+ gmx::IVec *GPU_FUNC_ARGUMENT(paddedGridSize)) GPU_FUNC_TERM
/*! \libinternal \brief
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
* \param[in] pmeGpuProgram The PME GPU program data
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo),
- PmeGpuProgramHandle CUDA_FUNC_ARGUMENT(pmeGpuProgram)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
+ PmeGpuProgramHandle GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM
/*! \libinternal \brief
* Destroys the PME GPU data at the end of the run.
*
* \param[in] pmeGpu The PME GPU structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM
/*! \libinternal \brief
* Reallocates the local atom data (charges, coordinates, etc.). Copies the charges to the GPU.
* This function should only be called at the beginning of the run and on domain decomposition.
* It should be called before pme_gpu_set_io_ranges().
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const int CUDA_FUNC_ARGUMENT(nAtoms),
- const real *CUDA_FUNC_ARGUMENT(charges)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const int GPU_FUNC_ARGUMENT(nAtoms),
+ const real *GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM
/*! \brief \libinternal
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
/*! \brief
* Factory function used to build persistent PME GPU program for the device at once.
- * \todo This should shortly become GPU_FUNC to support OpenCL.
*/
PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *);
struct PmeGpuCudaKernelParams;
/*! \brief A typedef for including the GPU kernel arguments data by pointer */
typedef PmeGpuCudaKernelParams PmeGpuKernelParams;
-
+#elif GMX_GPU == GMX_GPU_OPENCL
+struct PmeGpuKernelParamsBase;
+/*! \brief A typedef for including the GPU kernel arguments data by pointer */
+typedef PmeGpuKernelParamsBase PmeGpuKernelParams;
#else
-
/*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */
typedef int PmeGpuKernelParams;
-
#endif
struct gmx_device_info_t;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_PME_GPU_UTILS_H
+#define GMX_EWALD_PME_GPU_UTILS_H
+
+/*! \internal \file
+ * \brief This file defines the small PME GPU inline host/device functions.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "config.h"
+
+#include <cassert>
+
+#include "gromacs/math/vectypes.h" // for DIM and XX, used in the asserts and indexing below
+
+#include "pme-gpu-constants.h"
+
+//! A macro for inline GPU functions.
+#if GMX_GPU == GMX_GPU_CUDA
+#define INLINE_EVERYWHERE __host__ __device__ __forceinline__
+#else
+#define INLINE_EVERYWHERE inline
+#endif
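+// With CUDA, these helpers compile for both host and device; other builds get plain inline host functions.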
+
+/*! \internal \brief
+ * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block.
+ * Feed the result into getSplineParamIndex() to get a full index.
+ * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
+ * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
+ * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
+ *
+ * \tparam order PME order
+ * \param[in] warpIndex Warp index wrt the block.
+ * \param[in] atomWarpIndex Atom index wrt the warp (from 0 to PME_SPREADGATHER_ATOMS_PER_WARP - 1).
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template <int order>
+int INLINE_EVERYWHERE getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
+{
+ assert((atomWarpIndex >= 0) && (atomWarpIndex < PME_SPREADGATHER_ATOMS_PER_WARP));
+ const int dimIndex = 0;
+ const int splineIndex = 0;
+ // The zeroes are here to preserve the full index formula for reference
+ return (((splineIndex + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+}
+
+/*! \internal \brief
+ * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
+ * in the range [0, atomsPerBlock * order * DIM).
+ * This function consumes the result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
+ *
+ * \tparam order PME order
+ * \param[in] paramIndexBase Must be result of getSplineParamIndexBase().
+ * \param[in] dimIndex Dimension index (from 0 to 2)
+ * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template <int order>
+int INLINE_EVERYWHERE getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+{
+ assert((dimIndex >= XX) && (dimIndex < DIM));
+ assert((splineIndex >= 0) && (splineIndex < order));
+ return (paramIndexBase + (splineIndex * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP);
+}
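+
+/* A worked example (for illustration only): with order 4 (2 atoms per warp),
+ * getSplineParamIndexBase<4>(1, 0) == ((0 + 4 * 1) * DIM + 0) * 2 + 0 == 24;
+ * then getSplineParamIndex<4>(24, YY, 2) == 24 + (2 * DIM + YY) * 2 == 38.
+ */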
+
+#endif
#include "pme.cuh"
#include "pme-gpu-program-impl.h"
#include "pme-gpu-timings.h"
+#include "pme-gpu-utils.h"
#include "pme-grid.h"
/*
#include "pme-gpu-types-host.h"
#include "pme-gpu-types-host-impl.h"
-/*! \internal \brief
- * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
- * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block.
- * Feed the result into getSplineParamIndex() to get a full index.
- * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
- * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
- * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
- *
- * \tparam order PME order
- * \param[in] warpIndex Warp index wrt the block.
- * \param[in] atomWarpIndex Atom index wrt the warp (from 0 to PME_SPREADGATHER_ATOMS_PER_WARP - 1).
- *
- * \returns Index into theta or dtheta array using GPU layout.
- */
-template <int order>
-int __host__ __device__ __forceinline__ getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
-{
- assert((atomWarpIndex >= 0) && (atomWarpIndex < PME_SPREADGATHER_ATOMS_PER_WARP));
- const int dimIndex = 0;
- const int splineIndex = 0;
- // The zeroes are here to preserve the full index formula for reference
- return (((splineIndex + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
-}
-
-/*! \internal \brief
- * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
- * which is laid out for GPU spread/gather kernels. The index is wrt to the execution block,
- * in range(0, atomsPerBlock * order * DIM).
- * This function consumes result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
- *
- * \tparam order PME order
- * \param[in] paramIndexBase Must be result of getSplineParamIndexBase().
- * \param[in] dimIndex Dimension index (from 0 to 2)
- * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
- *
- * \returns Index into theta or dtheta array using GPU layout.
- */
-template <int order>
-int __host__ __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
-{
- assert((dimIndex >= XX) && (dimIndex < DIM));
- assert((splineIndex >= 0) && (splineIndex < order));
- return (paramIndexBase + (splineIndex * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP);
-}
-
/*! \brief \internal
* An inline CUDA function for checking the global atom data indices against the atom data array sizes.
*
*
* \param[in] pme The PME structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reset_timings(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM
/*! \brief
* Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.
* \param[in] pme The PME structure.
* \param[in] timings The gmx_wallclock_gpu_pme_t structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_get_timings(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallclock_gpu_pme_t *GPU_FUNC_ARGUMENT(timings)) GPU_FUNC_TERM
/* The main PME GPU functions */
* \param[in] flags The combination of flags to affect this PME computation.
* The flags are the GMX_PME_ flags from pme.h.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- bool CUDA_FUNC_ARGUMENT(needToUpdateBox),
- const matrix CUDA_FUNC_ARGUMENT(box),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- int CUDA_FUNC_ARGUMENT(flags)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ bool GPU_FUNC_ARGUMENT(needToUpdateBox),
+ const matrix GPU_FUNC_ARGUMENT(box),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ int GPU_FUNC_ARGUMENT(flags)) GPU_FUNC_TERM
/*! \brief
* Launches first stage of PME on GPU - H2D input transfers, spreading kernel, and D2H grid transfer if needed.
* \param[in] x The array of local atoms' coordinates.
* \param[in] wcycle The wallclock counter.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- const rvec *CUDA_FUNC_ARGUMENT(x),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ const rvec *GPU_FUNC_ARGUMENT(x),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
* \param[in] pme The PME data structure.
* \param[in] wcycle The wallclock counter.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
/*! \brief
* Launches last stage of PME on GPU - force gathering and D2H force transfer.
* the output reciprocal forces into the host array, or copies its contents to the GPU first
* and accumulates. The reduction is non-atomic.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment)) GPU_FUNC_TERM
/*! \brief
* Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
* \param[out] virial The output virial matrix.
* \param[out] energy The output energy.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_wait_finish_task(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- gmx::ArrayRef<const gmx::RVec> *CUDA_FUNC_ARGUMENT(forces),
- matrix CUDA_FUNC_ARGUMENT(virial),
- real *CUDA_FUNC_ARGUMENT(energy)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_wait_finish_task(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ gmx::ArrayRef<const gmx::RVec> *GPU_FUNC_ARGUMENT(forces),
+ matrix GPU_FUNC_ARGUMENT(virial),
+ real *GPU_FUNC_ARGUMENT(energy)) GPU_FUNC_TERM
/*! \brief
* Attempts to complete PME GPU tasks.
*
* \param[in] completionKind Indicates whether PME task completion should only be checked rather than waited for
* \returns True if the PME GPU tasks have completed
*/
-CUDA_FUNC_QUALIFIER bool pme_gpu_try_finish_task(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- gmx::ArrayRef<const gmx::RVec> *CUDA_FUNC_ARGUMENT(forces),
- matrix CUDA_FUNC_ARGUMENT(virial),
- real *CUDA_FUNC_ARGUMENT(energy),
- GpuTaskCompletion CUDA_FUNC_ARGUMENT(completionKind)) CUDA_FUNC_TERM_WITH_RETURN(false)
+GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ gmx::ArrayRef<const gmx::RVec> *GPU_FUNC_ARGUMENT(forces),
+ matrix GPU_FUNC_ARGUMENT(virial),
+ real *GPU_FUNC_ARGUMENT(energy),
+ GpuTaskCompletion GPU_FUNC_ARGUMENT(completionKind)) GPU_FUNC_TERM_WITH_RETURN(false)
/*! \brief
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
* \param[in] pme The PME data structure.
* \param[in] wcycle The wallclock counter.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
#endif