PME spline+spread CUDA kernel and unit tests

[alexxy/gromacs.git] / src / gromacs / ewald / pme.cuh
diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh

index 16b4b9aae8ae905f433656c9e41daea96b24f156..bed5feb4923a64a4c0c1bab45a9d6ce70739a8da 100644 (file)
--- a/src/gromacs/ewald/pme.cuh
+++ b/src/gromacs/ewald/pme.cuh
@@ -45,6 +45,8 @@
  #ifndef GMX_EWALD_PME_CUH
  #define GMX_EWALD_PME_CUH
  
+#include "config.h"
+
  #include <cassert>
  
  #include <array>
@@ -89,12 +91,23 @@ class GpuParallel3dFft;
   */
  #define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
  
-//! The spread/gather integer constant which depends on the templated order parameter (2 atoms per warp for order == 4)
+/*! \brief
+ * The number of atoms processed by a single warp in spread/gather.
+ * This macro depends on the templated order parameter (2 atoms per warp for order 4).
+ * It is mostly used for the spline data layout, which is tweaked for coalesced access.
+ */
  #define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
  
-//! Atom data alignment - has to be divisible both by spread and gather maximal atoms-per-block counts,
-//! which is asserted in case we use atom data padding at all.
-#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP);
+/*! \brief
+ * Atom data alignment (in terms of number of atoms).
+ * If the GPU atom data buffers are padded (c_usePadding == true),
+ * then the number of atoms which would fit in the padded GPU buffers has to be divisible by this.
+ * The literal number (16) expresses the maximum spread/gather block width in warps.
+ * Accordingly, spread and gather block widths in warps should be divisors of this
+ * (e.g. in pme-spread.cu: constexpr int c_spreadMaxThreadsPerBlock = 8 * warp_size;).
+ * There are debug asserts for this divisibility.
+ */
+#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
  
  /*! \brief \internal
   * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
@@ -193,10 +206,6 @@ struct pme_gpu_cuda_t
      int splineValuesSize;
      /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
      int splineValuesSizeAlloc;
-    /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (actual) */
-    int fractShiftsSize;
-    /*! \brief Both the kernelParams.grid.fshArray and kernelParams.grid.nnArray float element count (reserved) */
-    int fractShiftsSizeAlloc;
      /*! \brief The kernelParams.grid.realGrid float element count (actual) */
      int realGridSize;
      /*! \brief The kernelParams.grid.realGrid float element count (reserved) */
@@ -222,22 +231,16 @@ struct pme_gpu_cuda_kernel_params_t : pme_gpu_kernel_params_base_t
      cudaTextureObject_t gridlineIndicesTableTexture;
  };
  
-/* CUDA texture functions which will reside in respective kernel files
+/* CUDA texture reference functions which reside in the respective kernel files
   * (due to texture references having scope of a translation unit).
   */
-
-/*! \brief \internal
- * Creates/binds 2 textures used in the spline parameter computation.
- *
- * \param[in, out] pmeGPU         The PME GPU structure.
- */
-inline void pme_gpu_make_fract_shifts_textures(pme_gpu_t gmx_unused *pmeGpu){};
-
-/*! \brief \internal
- * Frees/unbinds 2 textures used in the spline parameter computation.
- *
- * \param[in] pmeGPU             The PME GPU structure.
- */
-inline void pme_gpu_free_fract_shifts_textures(const pme_gpu_t gmx_unused *pmeGpu){};
+#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
+extern texture<int, 1, cudaReadModeElementType>   gridlineIndicesTableTextureRef;
+extern texture<float, 1, cudaReadModeElementType> fractShiftsTableTextureRef;
+#endif
+/*! Returns the reference to the gridlineIndices texture. */
+texture<int, 1, cudaReadModeElementType>   &pme_gpu_get_gridline_texref();
+/*! Returns the reference to the fractShifts texture. */
+texture<float, 1, cudaReadModeElementType> &pme_gpu_get_fract_shifts_texref();
  
  #endif