gmx_add_libgromacs_sources(
# OpenCL-specific sources
pme-gpu-3dfft-ocl.cpp
+ # GPU-specific sources
+ pme-gpu.cpp
+ pme-gpu-internal.cpp
+ pme-gpu-timings.cpp
# Files that implement stubs
pme-gpu-program-impl.cpp
)
#include "pme.cuh"
#include "pme-gpu-program-impl.h"
#include "pme-gpu-timings.h"
+#include "pme-gpu-utils.h"
/*! \brief
* An inline CUDA function: unrolls the dynamic index accesses to the constant grid sizes to avoid local memory operations.
#if GMX_GPU == GMX_GPU_CUDA
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#else
+#define warp_size 32 // FIXME remove this and rework macros
+#define PME_SPREADGATHER_ATOMS_PER_WARP 2
#endif
/* General settings for PME GPU behaviour */
*/
#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
-/*! \brief
- * The number of atoms processed by a single warp in spread/gather.
- * This macro depends on the templated order parameter (2 atoms per warp for order 4).
- * It is mostly used for spline data layout tweaked for coalesced access.
- */
-#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
-
/*! \brief
* Atom data alignment (in terms of number of atoms).
+ * The value is (16 * PME_SPREADGATHER_ATOMS_PER_WARP).
* If the GPU atom data buffers are padded (c_usePadding == true),
* then the number of atoms which would fit in the padded GPU buffers has to be divisible by this.
* The literal number (16) expresses the maximum spread/gather block width in warps;
* accordingly, the spread/gather block widths in warps should be its divisors
* (e.g. in pme-spread.cu: constexpr int c_spreadMaxThreadsPerBlock = 8 * warp_size;).
* There are debug asserts for this divisibility.
*/
-#define PME_ATOM_DATA_ALIGNMENT (16 * PME_SPREADGATHER_ATOMS_PER_WARP)
-
-
+#define PME_ATOM_DATA_ALIGNMENT 32
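+// E.g. for order 4: 16 (max block width in warps) * 2 (atoms per warp) = 32.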
/*
* The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
* TODO: adjust those for OpenCL.
*/
+#if GMX_GPU == GMX_GPU_CUDA
+
+/*! \brief
+ * The number of atoms processed by a single warp in spread/gather.
+ * This macro depends on the templated order parameter (2 atoms per warp for order 4).
+ * It is mostly used for spline data layout tweaked for coalesced access.
+ */
+#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
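+// E.g. for order 4: PME_SPREADGATHER_THREADS_PER_ATOM = 4 * 4 = 16, so a 32-wide warp processes 32 / 16 = 2 atoms.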
+
//! Spreading max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime in most cases
constexpr int c_spreadMaxWarpsPerBlock = 8;
/* TODO: it has been observed that the kernel can be faster with smaller block sizes (2 or 4 warps)
//! Gathering min blocks per CUDA multiprocessor - for CC2.x, we just take the CUDA limit of 8 to avoid the warning
constexpr int c_gatherMinBlocksPerMP = (GMX_PTX_ARCH < 300) ? GMX_CUDA_MAX_BLOCKS_PER_MP : (GMX_CUDA_MAX_THREADS_PER_MP / c_gatherMaxThreadsPerBlock);
+#endif // GMX_GPU == GMX_GPU_CUDA
#endif
#include "gromacs/math/units.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/gpu_utils/pmalloc_cuda.h"
#include "pme.cuh"
+#elif GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/gmxopencl.h"
#endif
#include "pme-gpu-3dfft.h"
#include "pme-gpu-types.h"
#include "pme-gpu-types-host.h"
#include "pme-gpu-types-host-impl.h"
+#include "pme-gpu-utils.h"
#include "pme-grid.h"
#include "pme-internal.h"
return kernelParamsPtr;
}
-int pme_gpu_get_atom_data_alignment(const PmeGpu *pmeGpu)
+int pme_gpu_get_atom_data_alignment(const PmeGpu *)
{
- const int order = pmeGpu->common->pme_order;
- GMX_ASSERT(order > 0, "Invalid PME order");
+ // TODO: this can be simplified, as PME_ATOM_DATA_ALIGNMENT is now a compile-time constant independent of the PME order
return PME_ATOM_DATA_ALIGNMENT;
}
int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
+#if GMX_GPU == GMX_GPU_CUDA
const int order = pmeGpu->common->pme_order;
GMX_ASSERT(order > 0, "Invalid PME order");
return PME_SPREADGATHER_ATOMS_PER_WARP;
+#else
+ GMX_UNUSED_VALUE(pmeGpu);
+ GMX_THROW(gmx::NotImplementedError("Atom alignment per warp has to be deduced dynamically for OpenCL"));
+#endif
}
void pme_gpu_synchronize(const PmeGpu *pmeGpu)
{
- cudaError_t stat = cudaStreamSynchronize(pmeGpu->archSpecific->pmeStream);
- CU_RET_ERR(stat, "Failed to synchronize the PME GPU stream!");
+ gpuStreamSynchronize(pmeGpu->archSpecific->pmeStream);
}
void pme_gpu_alloc_energy_virial(const PmeGpu *pmeGpu)
const int newFractShiftsSize = cellCount * (nx + ny + nz);
+#if GMX_GPU == GMX_GPU_CUDA
initParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
kernelParamsPtr->fractShiftsTableTexture,
pmeGpu->common->fsh.data(),
pmeGpu->common->nn.data(),
newFractShiftsSize,
pmeGpu->deviceInfo);
+#elif GMX_GPU == GMX_GPU_OPENCL
+ // No dedicated texture routines in OpenCL; use plain device buffers for these lookup tables instead.
+ allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize, pmeGpu->archSpecific->context);
+ allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize, pmeGpu->archSpecific->context);
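+ // Both lookup tables have newFractShiftsSize = cellCount * (nx + ny + nz) elements, matching the CUDA path above.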
+ copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(),
+ 0, newFractShiftsSize,
+ pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr);
+ copyToDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, pmeGpu->common->nn.data(),
+ 0, newFractShiftsSize,
+ pmeGpu->archSpecific->pmeStream, GpuApiCallBehavior::Async, nullptr);
+#endif
}
void pme_gpu_free_fract_shifts(const PmeGpu *pmeGpu)
{
auto *kernelParamsPtr = pmeGpu->kernelParams.get();
+#if GMX_GPU == GMX_GPU_CUDA
destroyParamLookupTable(kernelParamsPtr->grid.d_fractShiftsTable,
kernelParamsPtr->fractShiftsTableTexture,
pmeGpu->deviceInfo);
destroyParamLookupTable(kernelParamsPtr->grid.d_gridlineIndicesTable,
kernelParamsPtr->gridlineIndicesTableTexture,
pmeGpu->deviceInfo);
+#elif GMX_GPU == GMX_GPU_OPENCL
+ freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
+ freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
+#endif
}
bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
* TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
*/
- /* WARNING: CUDA timings are incorrect with multiple streams.
- * This is the main reason why they are disabled by default.
- */
- // TODO: Consider turning on by default when we can detect nr of streams.
- pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ // Timing enabling - TODO: put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
+ if (GMX_GPU == GMX_GPU_CUDA)
+ {
+ /* WARNING: CUDA timings are incorrect with multiple streams.
+ * This is the main reason why they are disabled by default.
+ */
+ // TODO: Consider turning this on by default when we can detect the number of streams.
+ pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ }
+ else if (GMX_GPU == GMX_GPU_OPENCL)
+ {
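+ // OpenCL timing is enabled by default (can be disabled with GMX_DISABLE_GPU_TIMING).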
+ pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
+ }
// TODO: this is just a convenient reuse, because programHandle_ is currently in charge of creating the context
pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
+#if GMX_GPU == GMX_GPU_CUDA
// Prepare to use the device that this PME task was assigned earlier.
CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
+#endif
+#if GMX_GPU == GMX_GPU_CUDA
pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+#elif GMX_GPU == GMX_GPU_OPENCL
+ // TODO: we will need work size checks for OpenCL too
+#endif
- /* Creating a PME CUDA stream */
+ /* Creating a PME GPU stream:
+ * - default high priority with CUDA
+ * - no priorities implemented yet with OpenCL; see #2532
+ */
+#if GMX_GPU == GMX_GPU_CUDA
cudaError_t stat;
int highest_priority, lowest_priority;
stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
stat = cudaStreamCreateWithPriority(&pmeGpu->archSpecific->pmeStream,
cudaStreamDefault, //cudaStreamNonBlocking,
highest_priority);
CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
+#elif GMX_GPU == GMX_GPU_OPENCL
+ cl_command_queue_properties queueProperties = pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
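+ // Note: CL_QUEUE_PROFILING_ENABLE must be set at queue creation for clGetEventProfilingInfo-based timing to work.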
+ cl_device_id device_id = pmeGpu->deviceInfo->ocl_gpu_id.ocl_device_id;
+ cl_int clError;
+ pmeGpu->archSpecific->pmeStream = clCreateCommandQueue(pmeGpu->archSpecific->context,
+ device_id, queueProperties, &clError);
+ if (clError != CL_SUCCESS)
+ {
+ GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
+ }
+#endif
}
void pme_gpu_destroy_specific(const PmeGpu *pmeGpu)
{
+#if GMX_GPU == GMX_GPU_CUDA
/* Destroy the CUDA stream */
cudaError_t stat = cudaStreamDestroy(pmeGpu->archSpecific->pmeStream);
CU_RET_ERR(stat, "PME cudaStreamDestroy error");
+#elif GMX_GPU == GMX_GPU_OPENCL
+ cl_int clError = clReleaseCommandQueue(pmeGpu->archSpecific->pmeStream);
+ if (clError != CL_SUCCESS)
+ {
+ gmx_warning("Failed to destroy PME command queue");
+ }
+#endif
}
void pme_gpu_reinit_3dfft(const PmeGpu *pmeGpu)
#define GMX_EWALD_PME_GPU_INTERNAL_H
#include "gromacs/fft/fft.h" // for the gmx_fft_direction enum
-#include "gromacs/gpu_utils/gpu_macros.h" // for the CUDA_FUNC_ macros
+#include "gromacs/gpu_utils/gpu_macros.h" // for the GPU_FUNC_ macros
#include "gromacs/utility/arrayref.h"
#include "pme-gpu-types-host.h" // for the inline functions accessing PmeGpu members
*
* \param[in] pmeGpu The PME GPU structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_synchronize(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM
/*! \libinternal \brief
* Allocates the fixed size energy and virial buffer both on GPU and CPU.
*
* Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const rvec *CUDA_FUNC_ARGUMENT(h_coordinates)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const rvec *GPU_FUNC_ARGUMENT(h_coordinates)) GPU_FUNC_TERM
/*! \libinternal \brief
* Frees the coordinates on the GPU.
* \param[in] pmeGpu The PME GPU structure.
* \returns The input/output forces.
*/
-CUDA_FUNC_QUALIFIER gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(gmx::EmptyArrayRef())
+GPU_FUNC_QUALIFIER gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(gmx::EmptyArrayRef())
/*! \libinternal \brief
* Returns the output virial and energy of the PME solving.
* \param[out] energy The output energy.
* \param[out] virial The output virial matrix.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_get_energy_virial(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- real *CUDA_FUNC_ARGUMENT(energy),
- matrix CUDA_FUNC_ARGUMENT(virial)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_get_energy_virial(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ real *GPU_FUNC_ARGUMENT(energy),
+ matrix GPU_FUNC_ARGUMENT(virial)) GPU_FUNC_TERM
/*! \libinternal \brief
* Updates the unit cell parameters. Does not check if update is necessary - that is done in pme_gpu_prepare_computation().
* \param[in] pmeGpu The PME GPU structure.
* \param[in] box The unit cell box.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const matrix CUDA_FUNC_ARGUMENT(box)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_update_input_box(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const matrix GPU_FUNC_ARGUMENT(box)) GPU_FUNC_TERM
/*! \libinternal \brief
* Finishes the PME GPU computation, waiting for the output forces and/or energy/virial to be copied to the host.
* \param[in] dimIndex Dimension index.
* \param[in] transform Layout transform type
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const pme_atomcomm_t *CUDA_FUNC_ARGUMENT(atc),
- PmeSplineDataType CUDA_FUNC_ARGUMENT(type),
- int CUDA_FUNC_ARGUMENT(dimIndex),
- PmeLayoutTransform CUDA_FUNC_ARGUMENT(transform)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_transform_spline_atom_data(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const pme_atomcomm_t *GPU_FUNC_ARGUMENT(atc),
+ PmeSplineDataType GPU_FUNC_ARGUMENT(type),
+ int GPU_FUNC_ARGUMENT(dimIndex),
+ PmeLayoutTransform GPU_FUNC_ARGUMENT(transform)) GPU_FUNC_TERM
/*! \libinternal \brief
* Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
* \param[out] gridSize Pointer to the grid dimensions to fill in.
* \param[out] paddedGridSize Pointer to the padded grid dimensions to fill in.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- gmx::IVec *CUDA_FUNC_ARGUMENT(gridSize),
- gmx::IVec *CUDA_FUNC_ARGUMENT(paddedGridSize)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ gmx::IVec *GPU_FUNC_ARGUMENT(gridSize),
+ gmx::IVec *GPU_FUNC_ARGUMENT(paddedGridSize)) GPU_FUNC_TERM
/*! \libinternal \brief
* (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
* \param[in] pmeGpuProgram The PME GPU program data
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_device_info_t *CUDA_FUNC_ARGUMENT(gpuInfo),
- PmeGpuProgramHandle CUDA_FUNC_ARGUMENT(pmeGpuProgram)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_device_info_t *GPU_FUNC_ARGUMENT(gpuInfo),
+ PmeGpuProgramHandle GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM
/*! \libinternal \brief
* Destroys the PME GPU data at the end of the run.
*
* \param[in] pmeGpu The PME GPU structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM
/*! \libinternal \brief
* Reallocates the local atom data (charges, coordinates, etc.). Copies the charges to the GPU.
* This function should only be called at the beginning of the run and on domain decomposition.
* It should be called before pme_gpu_set_io_ranges().
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu),
- const int CUDA_FUNC_ARGUMENT(nAtoms),
- const real *CUDA_FUNC_ARGUMENT(charges)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
+ const int GPU_FUNC_ARGUMENT(nAtoms),
+ const real *GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM
/*! \brief \libinternal
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
/*! \brief
* Factory function used to build persistent PME GPU program for the device at once.
- * \todo This should shortly become GPU_FUNC to support OpenCL.
*/
PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *);
struct PmeGpuCudaKernelParams;
/*! \brief A typedef for including the GPU kernel arguments data by pointer */
typedef PmeGpuCudaKernelParams PmeGpuKernelParams;
-
+#elif GMX_GPU == GMX_GPU_OPENCL
+struct PmeGpuKernelParamsBase;
+/*! \brief A typedef for including the GPU kernel arguments data by pointer */
+typedef PmeGpuKernelParamsBase PmeGpuKernelParams;
#else
-
/*! \brief A dummy typedef for the GPU kernel arguments data placeholder on non-GPU builds */
typedef int PmeGpuKernelParams;
-
#endif
struct gmx_device_info_t;
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_PME_GPU_UTILS_H
+#define GMX_EWALD_PME_GPU_UTILS_H
+
+/*! \internal \file
+ * \brief This file defines the small PME GPU inline host/device functions.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include "config.h"
+
+#include <cassert>
+
+#include "gromacs/math/vectypes.h" // for DIM and XX, used in the asserts and indexing below
+
+#include "pme-gpu-constants.h"
+
+//! A macro for inline GPU functions.
+#if GMX_GPU == GMX_GPU_CUDA
+#define INLINE_EVERYWHERE __host__ __device__ __forceinline__
+#else
+#define INLINE_EVERYWHERE inline
+#endif
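+// With CUDA, these helpers compile for both host and device; other builds get plain inline host functions.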
+
+/*! \internal \brief
+ * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block.
+ * Feed the result into getSplineParamIndex() to get a full index.
+ * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
+ * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
+ * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
+ *
+ * \tparam order PME order
+ * \param[in] warpIndex Warp index wrt the block.
+ * \param[in] atomWarpIndex Atom index wrt the warp (from 0 to PME_SPREADGATHER_ATOMS_PER_WARP - 1).
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template <int order>
+int INLINE_EVERYWHERE getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
+{
+ assert((atomWarpIndex >= 0) && (atomWarpIndex < PME_SPREADGATHER_ATOMS_PER_WARP));
+ const int dimIndex = 0;
+ const int splineIndex = 0;
+ // The zeroes are here to preserve the full index formula for reference
+ return (((splineIndex + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+}
+
+/*! \internal \brief
+ * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The index is wrt the execution block,
+ * in the range [0, atomsPerBlock * order * DIM).
+ * This function consumes the result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
+ *
+ * \tparam order PME order
+ * \param[in] paramIndexBase Must be result of getSplineParamIndexBase().
+ * \param[in] dimIndex Dimension index (from 0 to 2)
+ * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template <int order>
+int INLINE_EVERYWHERE getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+{
+ assert((dimIndex >= XX) && (dimIndex < DIM));
+ assert((splineIndex >= 0) && (splineIndex < order));
+ return (paramIndexBase + (splineIndex * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP);
+}
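+
+/* A worked example (for illustration only): with order 4 (2 atoms per warp),
+ * getSplineParamIndexBase<4>(1, 0) == ((0 + 4 * 1) * DIM + 0) * 2 + 0 == 24;
+ * then getSplineParamIndex<4>(24, YY, 2) == 24 + (2 * DIM + YY) * 2 == 38.
+ */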
+
+#endif
#include "pme.cuh"
#include "pme-gpu-program-impl.h"
#include "pme-gpu-timings.h"
+#include "pme-gpu-utils.h"
#include "pme-grid.h"
/*
#include "pme-gpu-types-host.h"
#include "pme-gpu-types-host-impl.h"
-/*! \internal \brief
- * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
- * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index within the execution block.
- * Feed the result into getSplineParamIndex() to get a full index.
- * TODO: it's likely that both parameters can be just replaced with a single atom index, as they are derived from it.
- * Do that, verifying that the generated code is not bloated, and/or revise the spline indexing scheme.
- * Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
- *
- * \tparam order PME order
- * \param[in] warpIndex Warp index wrt the block.
- * \param[in] atomWarpIndex Atom index wrt the warp (from 0 to PME_SPREADGATHER_ATOMS_PER_WARP - 1).
- *
- * \returns Index into theta or dtheta array using GPU layout.
- */
-template <int order>
-int __host__ __device__ __forceinline__ getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
-{
- assert((atomWarpIndex >= 0) && (atomWarpIndex < PME_SPREADGATHER_ATOMS_PER_WARP));
- const int dimIndex = 0;
- const int splineIndex = 0;
- // The zeroes are here to preserve the full index formula for reference
- return (((splineIndex + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
-}
-
-/*! \internal \brief
- * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
- * which is laid out for GPU spread/gather kernels. The index is wrt to the execution block,
- * in range(0, atomsPerBlock * order * DIM).
- * This function consumes result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
- *
- * \tparam order PME order
- * \param[in] paramIndexBase Must be result of getSplineParamIndexBase().
- * \param[in] dimIndex Dimension index (from 0 to 2)
- * \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
- *
- * \returns Index into theta or dtheta array using GPU layout.
- */
-template <int order>
-int __host__ __device__ __forceinline__ getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
-{
- assert((dimIndex >= XX) && (dimIndex < DIM));
- assert((splineIndex >= 0) && (splineIndex < order));
- return (paramIndexBase + (splineIndex * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP);
-}
-
/*! \brief \internal
* An inline CUDA function for checking the global atom data indices against the atom data array sizes.
*
*
* \param[in] pme The PME structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reset_timings(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reset_timings(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM
/*! \brief
* Copies the PME GPU timings to the gmx_wallclock_gpu_pme_t structure (for log output). To be called at the run end.
* \param[in] pme The PME structure.
* \param[in] timings The gmx_wallclock_gpu_pme_t structure.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_get_timings(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallclock_gpu_pme_t *CUDA_FUNC_ARGUMENT(timings)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_get_timings(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallclock_gpu_pme_t *GPU_FUNC_ARGUMENT(timings)) GPU_FUNC_TERM
/* The main PME GPU functions */
* \param[in] flags The combination of flags to affect this PME computation.
* The flags are the GMX_PME_ flags from pme.h.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- bool CUDA_FUNC_ARGUMENT(needToUpdateBox),
- const matrix CUDA_FUNC_ARGUMENT(box),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- int CUDA_FUNC_ARGUMENT(flags)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ bool GPU_FUNC_ARGUMENT(needToUpdateBox),
+ const matrix GPU_FUNC_ARGUMENT(box),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ int GPU_FUNC_ARGUMENT(flags)) GPU_FUNC_TERM
/*! \brief
* Launches first stage of PME on GPU - H2D input transfers, spreading kernel, and D2H grid transfer if needed.
* \param[in] x The array of local atoms' coordinates.
* \param[in] wcycle The wallclock counter.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- const rvec *CUDA_FUNC_ARGUMENT(x),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ const rvec *GPU_FUNC_ARGUMENT(x),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
* \param[in] pme The PME data structure.
* \param[in] wcycle The wallclock counter.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
/*! \brief
* Launches last stage of PME on GPU - force gathering and D2H force transfer.
* the output reciprocal forces into the host array, or copies its contents to the GPU first
* and accumulates. The reduction is non-atomic.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment)) GPU_FUNC_TERM
/*! \brief
* Blocks until PME GPU tasks are completed, and gets the output forces and virial/energy
* \param[out] virial The output virial matrix.
* \param[out] energy The output energy.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_wait_finish_task(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- gmx::ArrayRef<const gmx::RVec> *CUDA_FUNC_ARGUMENT(forces),
- matrix CUDA_FUNC_ARGUMENT(virial),
- real *CUDA_FUNC_ARGUMENT(energy)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_wait_finish_task(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ gmx::ArrayRef<const gmx::RVec> *GPU_FUNC_ARGUMENT(forces),
+ matrix GPU_FUNC_ARGUMENT(virial),
+ real *GPU_FUNC_ARGUMENT(energy)) GPU_FUNC_TERM
/*! \brief
* Attempts to complete PME GPU tasks.
*
* \param[in] completionKind Indicates whether PME task completion should only be checked rather than waited for
* \returns True if the PME GPU tasks have completed
*/
-CUDA_FUNC_QUALIFIER bool pme_gpu_try_finish_task(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle),
- gmx::ArrayRef<const gmx::RVec> *CUDA_FUNC_ARGUMENT(forces),
- matrix CUDA_FUNC_ARGUMENT(virial),
- real *CUDA_FUNC_ARGUMENT(energy),
- GpuTaskCompletion CUDA_FUNC_ARGUMENT(completionKind)) CUDA_FUNC_TERM_WITH_RETURN(false)
+GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
+ gmx::ArrayRef<const gmx::RVec> *GPU_FUNC_ARGUMENT(forces),
+ matrix GPU_FUNC_ARGUMENT(virial),
+ real *GPU_FUNC_ARGUMENT(energy),
+ GpuTaskCompletion GPU_FUNC_ARGUMENT(completionKind)) GPU_FUNC_TERM_WITH_RETURN(false)
/*! \brief
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
* \param[in] pme The PME data structure.
* \param[in] wcycle The wallclock counter.
*/
-CUDA_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *CUDA_FUNC_ARGUMENT(pme),
- gmx_wallcycle *CUDA_FUNC_ARGUMENT(wcycle)) CUDA_FUNC_TERM
+GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
+ gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM
#endif