*/
#include "pme_gpu_types.h"
-#include "pme_gpu_utils.clh"
+#include "pme_gpu_calculate_splines.clh"
#ifndef COMPILE_GATHER_HELPERS_ONCE
# define COMPILE_GATHER_HELPERS_ONCE
#include "gromacs/gpu_utils/typecasts.cuh"
#include "pme.cuh"
-#include "pme_calculate_splines.cuh"
-#include "pme_gpu_utils.h"
+#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"
/*! \brief
* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifndef GMX_EWALD_PME_GPU_UTILS_CLH
-#define GMX_EWALD_PME_GPU_UTILS_CLH
+#ifndef GMX_EWALD_PME_GPU_CALCULATE_SPLINES_CLH
+#define GMX_EWALD_PME_GPU_CALCULATE_SPLINES_CLH
/*! \internal \file
- * \brief This file defines the small PME OpenCL inline device functions.
- * This closely mirrors pme_gpu_utils.h (which is used in CUDA and unit tests), except with no templates.
+ * \brief This file defines the PME OpenCL inline device functions for computing splines.
+ * This closely mirrors pme_gpu_calculate_splines.cuh (which is used in CUDA kernels), except with no templates.
* Instead of templated parameters this file expects following defines during compilation:
* - order - PME interpolation order;
* - atomsPerWarp - number of atoms processed by a warp (fixed for spread and gather kernels to be the same);
*/
inline int pme_gpu_check_atom_charge(const float coefficient)
{
+ assert(isfinite(coefficient));
return c_skipNeutralAtoms ? (coefficient != 0.0F) : 1;
}
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "pme.cuh"
-#include "pme_gpu_utils.h"
#include "pme_grid.h"
+/*! \internal \brief
+ * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block.
+ * Feed the result into getSplineParamIndex() to get a full index.
+ * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
+ * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
+ * Removing warp dependency would also be nice (and would probably coincide with removing c_pmeSpreadGatherAtomsPerWarp).
+ *
+ * \tparam order PME order
+ * \tparam atomsPerWarp Number of atoms processed by a warp
+ * \param[in] warpIndex Warp index wrt the block.
+ * \param[in] atomWarpIndex Atom index wrt the warp (from 0 to atomsPerWarp - 1).
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerWarp>
+int __device__ __forceinline__ getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
+{
+ assert((atomWarpIndex >= 0) && (atomWarpIndex < atomsPerWarp));
+ const int dimIndex = 0;
+ const int splineIndex = 0;
+ // The zeroes are here to preserve the full index formula for reference
+ return (((splineIndex + order * warpIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex);
+}
+
+/*! \internal \brief
+ * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The index is wrt to the execution block,
+ * in range(0, atomsPerBlock * order * DIM).
+ * This function consumes result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
+ *
+ * \tparam order PME order
+ * \tparam atomsPerWarp Number of atoms processed by a warp
+ * \param[in] paramIndexBase Must be result of getSplineParamIndexBase().
+ * \param[in] dimIndex Dimension index (from 0 to 2)
+ * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerWarp>
+int __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+{
+ assert((dimIndex >= XX) && (dimIndex < DIM));
+ assert((splineIndex >= 0) && (splineIndex < order));
+ return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
+}
+
+/*! \internal \brief
+ * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
+ *
+ * \param[in] atomDataIndex The atom data index.
+ * \param[in] nAtomData The atom data array element count.
+ * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
+{
+ return c_usePadding ? 1 : (atomDataIndex < nAtomData);
+}
+
+/*! \internal \brief
+ * An inline CUDA function for skipping the zero-charge atoms.
+ *
+ * \returns Non-0 if atom should be processed, 0 otherwise.
+ * \param[in] coefficient The atom charge.
+ *
+ * This is called from the spline_and_spread and gather PME kernels.
+ */
+int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
+{
+ assert(isfinite(coefficient));
+ return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
+}
+
//! Controls if the atom and charge data is prefeched into shared memory or loaded per thread from global
static const bool c_useAtomDataPrefetch = true;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* \ingroup module_ewald
*/
-#include "config.h"
-
#include <cassert>
-#include "pme_gpu_constants.h"
+#include "gromacs/math/vectypes.h"
-//! A macro for inline GPU functions.
-#if GMX_GPU == GMX_GPU_CUDA
-# define INLINE_EVERYWHERE __host__ __device__ __forceinline__
-#else
-# define INLINE_EVERYWHERE inline
-#endif
+struct PmeGpu;
/*! \internal \brief
* Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
* \returns Index into theta or dtheta array using GPU layout.
*/
template<int order, int atomsPerWarp>
-int INLINE_EVERYWHERE getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
+int inline getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
{
assert((atomWarpIndex >= 0) && (atomWarpIndex < atomsPerWarp));
const int dimIndex = 0;
* \returns Index into theta or dtheta array using GPU layout.
*/
template<int order, int atomsPerWarp>
-int INLINE_EVERYWHERE getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+int inline getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
{
assert((dimIndex >= XX) && (dimIndex < DIM));
assert((splineIndex >= 0) && (splineIndex < order));
return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
}
-#if GMX_GPU == GMX_GPU_CUDA
-// CUDA device code helpers below
-
-/*! \internal \brief
- * An inline CUDA function for checking the global atom data indices against the atom data array sizes.
- *
- * \param[in] atomDataIndex The atom data index.
- * \param[in] nAtomData The atom data array element count.
- * \returns Non-0 if index is within bounds (or PME data padding is enabled), 0 otherwise.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- * The goal is to isolate the global range checks, and allow avoiding them with c_usePadding enabled.
- */
-int __device__ __forceinline__ pme_gpu_check_atom_data_index(const int atomDataIndex, const int nAtomData)
-{
- return c_usePadding ? 1 : (atomDataIndex < nAtomData);
-}
-
-/*! \internal \brief
- * An inline CUDA function for skipping the zero-charge atoms.
- *
- * \returns Non-0 if atom should be processed, 0 otherwise.
- * \param[in] coefficient The atom charge.
- *
- * This is called from the spline_and_spread and gather PME kernels.
- */
-int __device__ __forceinline__ pme_gpu_check_atom_charge(const float coefficient)
-{
- assert(isfinite(coefficient));
- return c_skipNeutralAtoms ? (coefficient != 0.0f) : 1;
-}
-
-#endif
-
#endif
#include "gromacs/ewald/pme.h"
#include "pme_gpu_3dfft.h"
+#include "pme_gpu_calculate_splines.h"
#include "pme_gpu_constants.h"
#include "pme_gpu_program_impl.h"
#include "pme_gpu_timings.h"
#include "pme_gpu_types.h"
#include "pme_gpu_types_host.h"
#include "pme_gpu_types_host_impl.h"
-#include "pme_gpu_utils.h"
#include "pme_grid.h"
#include "pme_internal.h"
#include "pme_solve.h"
#include "gromacs/gpu_utils/vectype_ops.clh"
#include "pme_gpu_types.h"
-#include "pme_gpu_utils.clh"
+#include "pme_gpu_calculate_splines.clh"
/*
* This define affects the spline calculation behaviour in the kernel.
#include "gromacs/gpu_utils/typecasts.cuh"
#include "pme.cuh"
-#include "pme_calculate_splines.cuh"
-#include "pme_gpu_utils.h"
+#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"
/*! \brief