Prepare pme-gpu-types.h for OpenCL

author Aleksei Iupinov <a.yupinov@gmail.com>

Wed, 16 May 2018 12:39:36 +0000 (14:39 +0200)

committer Szilárd Páll <pall.szilard@gmail.com>

Mon, 27 Aug 2018 10:31:34 +0000 (12:31 +0200)
author Aleksei Iupinov <a.yupinov@gmail.com>
Wed, 16 May 2018 12:39:36 +0000 (14:39 +0200)
committer Szilárd Páll <pall.szilard@gmail.com>
Mon, 27 Aug 2018 10:31:34 +0000 (12:31 +0200)
diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h

index 55d04169623a7959c19eb94afb072fae64d6823e..6fb6cae82726120279bb3a0a6329949fac3be110 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-types.h
+++ b/src/gromacs/ewald/pme-gpu-types.h
@@ -33,7 +33,7 @@
   * the research papers on the package. Check out http://www.gromacs.org.
   */
  
-/*! \libinternal \file
+/*! \internal \file
   * \brief Defines the PME GPU data structures
   * (the GPU function parameters used both on host and device sides).
   *
@@ -44,7 +44,41 @@
  #ifndef GMX_EWALD_PME_GPU_TYPES_H
  #define GMX_EWALD_PME_GPU_TYPES_H
  
+/*
+ * In OpenCL, the structures must be laid out on the host and device exactly the same way.
+ * If something is off, one might get an error CL_INVALID_ARG_SIZE if any structure's sizes don't match.
+ * What's worse, structures might be of same size but members might be aligned differently,
+ * resulting in wrong kernel results. The structures below are aligned manually.
+ * The pattern is ordering the members of structs from smallest to largest sizeof
+ * (arrays behave the same way as sequences of separate fields),
+ * as described in "The Lost Art of C Structure Packing".
+ *
+ * However, if the need arises at some point, they can all be aligned forcefully:
+ *
+ * #define GMX_GPU_ALIGNED __attribute__ ((aligned(8)))
+ * struct GMX_GPU_ALIGNED PmeGpuConstParams
+ * struct GMX_GPU_ALIGNED PmeGpuGridParams
+ * etc...
+ *
+ * One might also try __attribute__ ((packed)), but it doesn't work with DeviceBuffer,
+ * as it appears to not be POD.
+ */
+
+
+/*! \brief A workaround to hide DeviceBuffer template from OpenCL kernel compilation
+ * - to turn it into a dummy of the same size as host implementation of device buffer.
+ * As we only care about 64-bit, 8 bytes is fine.
+ * TODO: what we should be doing is providing separate device-side views of the same structures -
+ * then there would be no need for macro.
+ */
+#ifndef __OPENCL_C_VERSION__
  #include "gromacs/gpu_utils/devicebuffer.h"
+#define HIDE_FROM_OPENCL_COMPILER(x) x
+static_assert(sizeof(DeviceBuffer<float>) == 8, "DeviceBuffer is defined as an 8 byte stub for OpenCL C");
+static_assert(sizeof(DeviceBuffer<int>) == 8, "DeviceBuffer is defined as an 8 byte stub for OpenCL C");
+#else
+#define HIDE_FROM_OPENCL_COMPILER(x) char8
+#endif
  
  /* What follows is all the PME GPU function arguments,
   * sorted into several device-side structures depending on the update rate.
@@ -63,7 +97,7 @@ struct PmeGpuConstParams
      float elFactor;
      /*! \brief Virial and energy GPU array. Size is PME_GPU_ENERGY_AND_VIRIAL_COUNT (7) floats.
       * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
-    DeviceBuffer<float> d_virialAndEnergy;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_virialAndEnergy;
  };
  
  /*! \internal \brief
@@ -72,6 +106,9 @@ struct PmeGpuConstParams
   */
  struct PmeGpuGridParams
  {
+    /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
+    float ewaldFactor;
+
      /* Grid sizes */
      /*! \brief Real-space grid data dimensions. */
      int   realGridSize[DIM];
@@ -84,29 +121,26 @@ struct PmeGpuGridParams
      /*! \brief Fourier grid dimensions (padded). This counts the complex numbers! */
      int   complexGridSizePadded[DIM];
  
+    /*! \brief Offsets for X/Y/Z components of d_splineModuli */
+    int  splineValuesOffset[DIM];
+    /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
+    int  tablesOffsets[DIM];
+
      /* Grid arrays */
      /*! \brief Real space grid. */
-    DeviceBuffer<float> d_realGrid;
-    /*! \brief Complex grid - used in FFT/solve. If inplace cuFFT is used, then it is the same handle as realGrid. */
-    DeviceBuffer<float> d_fourierGrid;
-
-    /*! \brief Ewald solving factor = (M_PI / pme->ewaldcoeff_q)^2 */
-    float ewaldFactor;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_realGrid;
+    /*! \brief Complex grid - used in FFT/solve. If inplace cu/clFFT is used, then it is the same handle as realGrid. */
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_fourierGrid;
  
      /*! \brief Grid spline values as in pme->bsp_mod
       * (laid out sequentially (XXX....XYYY......YZZZ.....Z))
       */
-    DeviceBuffer<float> d_splineModuli;
-    /*! \brief Offsets for X/Y/Z components of d_splineModuli */
-    int                 splineValuesOffset[DIM];
-
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_splineModuli;
      /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
-    DeviceBuffer<float> d_fractShiftsTable;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_fractShiftsTable;
      /*! \brief Gridline indices lookup table
       * (modulo lookup table as in pme->nnx/nny/nnz, laid out sequentially (XXX....XYYY......YZZZ.....Z)) */
-    DeviceBuffer<int> d_gridlineIndicesTable;
-    /*! \brief Offsets for X/Y/Z components of d_fractShiftsTable and d_gridlineIndicesTable */
-    int               tablesOffsets[DIM];
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<int>) d_gridlineIndicesTable;
  };
  
  /*! \internal \brief
@@ -121,28 +155,27 @@ struct PmeGpuAtomParams
       * The coordinates themselves change and need to be copied to the GPU for every PME computation,
       * but reallocation happens only at DD.
       */
-    DeviceBuffer<float> d_coordinates;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coordinates;
      /*! \brief Global GPU memory array handle with input atom charges.
       * The charges only need to be reallocated and copied to the GPU at DD step.
       */
-    DeviceBuffer<float> d_coefficients;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coefficients;
      /*! \brief Global GPU memory array handle with input/output rvec atom forces.
       * The forces change and need to be copied from (and possibly to) the GPU for every PME computation,
       * but reallocation happens only at DD.
       */
-    DeviceBuffer<float> d_forces;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_forces;
      /*! \brief Global GPU memory array handle with ivec atom gridline indices.
       * Computed on GPU in the spline calculation part.
       */
-    DeviceBuffer<int> d_gridlineIndices;
-
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<int>) d_gridlineIndices;
      /* B-spline parameters are computed entirely on GPU for every PME computation, not copied.
       * Unless we want to try something like GPU spread + CPU gather?
       */
      /*! \brief Global GPU memory array handle with B-spline values */
-    DeviceBuffer<float> d_theta;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_theta;
      /*! \brief Global GPU memory array handle with B-spline derivative values */
-    DeviceBuffer<float> d_dtheta;
+    HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_dtheta;
  };
  
  /*! \internal \brief
@@ -173,16 +206,16 @@ struct PmeGpuDynamicParams
  struct PmeGpuKernelParamsBase
  {
      /*! \brief Constant data that is set once. */
-    PmeGpuConstParams   constants;
+    struct PmeGpuConstParams   constants;
      /*! \brief Data dependent on the grid size/cutoff. */
-    PmeGpuGridParams    grid;
+    struct PmeGpuGridParams    grid;
      /*! \brief Data dependent on the DD and local atoms. */
-    PmeGpuAtomParams    atoms;
+    struct PmeGpuAtomParams    atoms;
      /*! \brief Data that possibly changes for every new PME computation.
       * This should be kept up-to-date by calling pme_gpu_prepare_computation(...)
       * before launching spreading.
       */
-    PmeGpuDynamicParams current;
+    struct PmeGpuDynamicParams current;
  };
  
  #endif
author	Aleksei Iupinov <a.yupinov@gmail.com>
	Wed, 16 May 2018 12:39:36 +0000 (14:39 +0200)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 27 Aug 2018 10:31:34 +0000 (12:31 +0200)