Consolidate PME GPU spline infrastructure

author Mark Abraham <mark.j.abraham@gmail.com>

Fri, 6 Mar 2020 16:30:26 +0000 (17:30 +0100)

committer Artem Zhmurov <zhmurov@gmail.com>

Mon, 9 Mar 2020 22:27:50 +0000 (23:27 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 6 Mar 2020 16:30:26 +0000 (17:30 +0100)
committer Artem Zhmurov <zhmurov@gmail.com>
Mon, 9 Mar 2020 22:27:50 +0000 (23:27 +0100)
diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh

index 02b42de15150628eaf25ae7000ed37ac030c677c..10b14dbe09553b783030bb3d8ba235a8ce59d663 100644 (file)
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -50,7 +50,7 @@
   */
  
  #include "pme_gpu_types.h"
-#include "pme_gpu_utils.clh"
+#include "pme_gpu_calculate_splines.clh"
  
  #ifndef COMPILE_GATHER_HELPERS_ONCE
  #    define COMPILE_GATHER_HELPERS_ONCE
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu

index e95e9609e7c2b3042cd1eeba4bf662ab9daabeda..e3d97b98445510ed3efcf586309199ad31421bd9 100644 (file)
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -47,8 +47,7 @@
  #include "gromacs/gpu_utils/typecasts.cuh"
  
  #include "pme.cuh"
-#include "pme_calculate_splines.cuh"
-#include "pme_gpu_utils.h"
+#include "pme_gpu_calculate_splines.cuh"
  #include "pme_grid.h"
  
  /*! \brief
diff --git a/src/gromacs/ewald/pme_gpu_utils.clh b/src/gromacs/ewald/pme_gpu_calculate_splines.clh

similarity index 94%

rename from src/gromacs/ewald/pme_gpu_utils.clh

rename to src/gromacs/ewald/pme_gpu_calculate_splines.clh

index 1cec1a1cee2215bfdb90b5f227b730caf03db60b..8fb056b12b0075fff44cec5b51517aa94e3456fb 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_utils.clh
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines.clh
@@ -32,12 +32,12 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
-#ifndef GMX_EWALD_PME_GPU_UTILS_CLH
-#define GMX_EWALD_PME_GPU_UTILS_CLH
+#ifndef GMX_EWALD_PME_GPU_CALCULATE_SPLINES_CLH
+#define GMX_EWALD_PME_GPU_CALCULATE_SPLINES_CLH
  
  /*! \internal \file
- * \brief This file defines the small PME OpenCL inline device functions.
- * This closely mirrors pme_gpu_utils.h (which is used in CUDA and unit tests), except with no templates.
+ * \brief This file defines the PME OpenCL inline device functions for computing splines.
+ * This closely mirrors pme_gpu_calculate_splines.cuh (which is used in CUDA kernels), except with no templates.
   * Instead of templated parameters this file expects following defines during compilation:
   * - order - PME interpolation order;
   * - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
@@ -110,6 +110,7 @@ inline int pme_gpu_check_atom_data_index(const size_t atomDataIndex, const size_
   */
  inline int pme_gpu_check_atom_charge(const float coefficient)
  {
+    assert(isfinite(coefficient));
      return c_skipNeutralAtoms ? (coefficient != 0.0F) : 1;
  }
  
diff --git a/src/gromacs/ewald/pme_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh

similarity index 80%

rename from src/gromacs/ewald/pme_calculate_splines.cuh

rename to src/gromacs/ewald/pme_gpu_calculate_splines.cuh

index 4e5dc6e8882356a9ba4ad81b83a6068a4b91312b..e305359158bc6aff18b520dc110dc16adc5c7ca4 100644 (file)
--- a/src/gromacs/ewald/pme_calculate_splines.cuh
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh
@@ -46,9 +46,84 @@
  #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  
  #include "pme.cuh"
-#include "pme_gpu_utils.h"
  #include "pme_grid.h"
  
+/*! \internal \brief
+ * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block.
+ * Feed the result into getSplineParamIndex() to get a full index.
+ * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
+ * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
+ * Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp).
+ *
+ * \tparam order               PME order
+ * \tparam atomsPerWarp        Number of atoms processed by a warp
+ * \param[in] warpIndex        Warp index wrt the block.
+ * \param[in] atomWarpIndex    Atom index wrt the warp (from 0 to atomsPerWarp - 1).
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerWarp>
+int __device__ __forceinline__ getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
+{
+    assert((atomWarpIndex >= 0) && (atomWarpIndex < atomsPerWarp));
+    const int dimIndex    = 0;
+    const int splineIndex = 0;
+    // The zeroes are here to preserve the full index formula for reference
+    return (((splineIndex + order * warpIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex);
+}
+
+/*! \internal \brief
+ * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The index is wrt to the execution block,
+ * in range(0, atomsPerBlock * order * DIM).
+ * This function consumes result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
+ *
+ * \tparam order               PME order
+ * \tparam atomsPerWarp        Number of atoms processed by a warp
+ * \param[in] paramIndexBase   Must be result of getSplineParamIndexBase().
+ * \param[in] dimIndex         Dimension index (from 0 to 2)
+ * \param[in] splineIndex      Spline contribution index (from 0 to \p order - 1)
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerWarp>
+int __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+{
+    assert((dimIndex >= XX) && (dimIndex < DIM));
+    assert((splineIndex >= 0) && (splineIndex < order));
+    return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
+}
+
+/*! \internal \brief
+ * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
+ *
+ * \param[in] atomDataIndex        The atom data index.
+ * \param[in] nAtomData            The atom data array element count.
+ * \returns                        Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
+{
+    return c_usePadding ? 1 : (atomDataIndex < nAtomData);
+}
+
+/*! \internal \brief
+ * An inline CUDA function for skipping the zero-charge atoms.
+ *
+ * \returns                        Non-0 if atom should be processed, 0 otherwise.
+ * \param[in] coefficient          The atom charge.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+{
+    assert(isfinite(coefficient));
+    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+}
+
  //! Controls if the atom and charge data is prefeched into shared memory or loaded per thread from global
  static const bool c_useAtomDataPrefetch = true;
  
diff --git a/src/gromacs/ewald/pme_gpu_utils.h b/src/gromacs/ewald/pme_gpu_calculate_splines.h

similarity index 71%

rename from src/gromacs/ewald/pme_gpu_utils.h

rename to src/gromacs/ewald/pme_gpu_calculate_splines.h

index 9e4ca2a8bc716dd9f1a8a2add27d324e9e374d26..ab87a73c02212f318769874fd11014a56f81091f 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_utils.h
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -45,18 +45,11 @@
   * \ingroup module_ewald
   */
  
-#include "config.h"
-
  #include <cassert>
  
-#include "pme_gpu_constants.h"
+#include "gromacs/math/vectypes.h"
  
-//! A macro for inline GPU functions.
-#if GMX_GPU == GMX_GPU_CUDA
-#    define INLINE_EVERYWHERE __host__ __device__ __forceinline__
-#else
-#    define INLINE_EVERYWHERE inline
-#endif
+struct PmeGpu;
  
  /*! \internal \brief
   * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
@@ -74,7 +67,7 @@
   * \returns Index into theta or dtheta array using GPU layout.
   */
  template<int order, int atomsPerWarp>
-int INLINE_EVERYWHERE getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
+int inline getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
  {
      assert((atomWarpIndex >= 0) && (atomWarpIndex < atomsPerWarp));
      const int dimIndex    = 0;
@@ -98,45 +91,11 @@ int INLINE_EVERYWHERE getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
   * \returns Index into theta or dtheta array using GPU layout.
   */
  template<int order, int atomsPerWarp>
-int INLINE_EVERYWHERE getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+int inline getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
  {
      assert((dimIndex >= XX) && (dimIndex < DIM));
      assert((splineIndex >= 0) && (splineIndex < order));
      return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
  }
  
-#if GMX_GPU == GMX_GPU_CUDA
-// CUDA device code helpers below
-
-/*! \internal \brief
- * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] atomDataIndex        The atom data index.
- * \param[in] nAtomData            The atom data array element count.
- * \returns                        Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
- */
-int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
-{
-    return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
-/*! \internal \brief
- * An inline CUDA function for skipping the zero-charge atoms.
- *
- * \returns                        Non-0 if atom should be processed, 0 otherwise.
- * \param[in] coefficient          The atom charge.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- */
-int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
-{
-    assert(isfinite(coefficient));
-    return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
-}
-
-#endif
-
  #endif
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp

index 17dd6805c06d2d899583348a94f3468944307fff..bd3b25e1cdf1cfa2cf4e07c5f0fcf166fd5c5fc3 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -77,13 +77,13 @@
  #include "gromacs/ewald/pme.h"
  
  #include "pme_gpu_3dfft.h"
+#include "pme_gpu_calculate_splines.h"
  #include "pme_gpu_constants.h"
  #include "pme_gpu_program_impl.h"
  #include "pme_gpu_timings.h"
  #include "pme_gpu_types.h"
  #include "pme_gpu_types_host.h"
  #include "pme_gpu_types_host_impl.h"
-#include "pme_gpu_utils.h"
  #include "pme_grid.h"
  #include "pme_internal.h"
  #include "pme_solve.h"
diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh

index 0ea1449c22f33e6a81c23215b81bfef7d2af200c..478bc0acdf9ff74cdaaa14382c70e7d3ab3772d6 100644 (file)
--- a/src/gromacs/ewald/pme_spread.clh
+++ b/src/gromacs/ewald/pme_spread.clh
@@ -54,7 +54,7 @@
  #include "gromacs/gpu_utils/vectype_ops.clh"
  
  #include "pme_gpu_types.h"
-#include "pme_gpu_utils.clh"
+#include "pme_gpu_calculate_splines.clh"
  
  /*
   * This define affects the spline calculation behaviour in the kernel.
diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu

index 3fb0ad802fcd0085ecaf0a88b1a6acb136049ae9..287bfaec55bde14c2f3aa00c2851a8ed4b10e2f9 100644 (file)
--- a/src/gromacs/ewald/pme_spread.cu
+++ b/src/gromacs/ewald/pme_spread.cu
@@ -50,8 +50,7 @@
  #include "gromacs/gpu_utils/typecasts.cuh"
  
  #include "pme.cuh"
-#include "pme_calculate_splines.cuh"
-#include "pme_gpu_utils.h"
+#include "pme_gpu_calculate_splines.cuh"
  #include "pme_grid.h"
  
  /*! \brief
author	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 6 Mar 2020 16:30:26 +0000 (17:30 +0100)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Mon, 9 Mar 2020 22:27:50 +0000 (23:27 +0100)
src/gromacs/ewald/pme_gather.clh		patch \| blob \| history
src/gromacs/ewald/pme_gather.cu		patch \| blob \| history
src/gromacs/ewald/pme_gpu_calculate_splines.clh	[moved from src/gromacs/ewald/pme_gpu_utils.clh with 94% similarity]	patch \| blob \| history
src/gromacs/ewald/pme_gpu_calculate_splines.cuh	[moved from src/gromacs/ewald/pme_calculate_splines.cuh with 80% similarity]	patch \| blob \| history
src/gromacs/ewald/pme_gpu_calculate_splines.h	[moved from src/gromacs/ewald/pme_gpu_utils.h with 71% similarity]	patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme_spread.clh		patch \| blob \| history
src/gromacs/ewald/pme_spread.cu		patch \| blob \| history